Skip to content

Canonicalization

canonicalize(psmiles)

Canonicalize PSMILES string

Note

Two step process (1) Unify ([*]CCO[*] -> [*]COO[*]); apply multiple times (2) Reduce ([*]CCOCCO[*] -> [*]CCO[*]); apply multiple times

Returns:

Name Type Description
str str

canonicalized PSMILES string

Source code in canonicalize_psmiles/canonicalize.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def canonicalize(psmiles) -> str:
    r"""Canonicalize PSMILES string

    Note:
        Two step process
        (1) Unify ([\*]CCO[\*] -> [\*]COO[\*]); apply multiple times
        (2) Reduce ([\*]CCOCCO[\*] -> [\*]CCO[\*]); apply multiple times

    Returns:
        str: canonicalized PSMILES string
    """

    # Apply unify once
    new_ps = unify(psmiles)

    new_ps = add_brackets(new_ps)

    # recursively apply reduce
    for num in range(20):
        new = reduce_multiplication(new_ps)
        new = add_brackets(new)
        if new == new_ps:
            break
        new_ps = new

    return add_brackets(new_ps)

get_connection_info(mol=None, symbol='*')

Get connection information a PSMILES string.

Parameters:

Name Type Description Default
mol RWMol

description. Defaults to None.

None
symbol str

description. Defaults to "*".

'*'

Returns:

Name Type Description
Dict Dict

Dictionary containing information of the mol

Source code in canonicalize_psmiles/canonicalize.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def get_connection_info(mol=None, symbol="*") -> Dict:
    """Get connection information a PSMILES string.

    Args:
        mol (Chem.RWMol, optional): _description_. Defaults to None.
        symbol (str, optional): _description_. Defaults to "*".

    Returns:
        Dict: Dictionary containing information of the mol
    """   

    ret_dict = {}

    stars_indices, stars_type, all_symbols, all_index = [], [], [], []
    for star_idx, atom in enumerate(mol.GetAtoms()):
        all_symbols.append(atom.GetSymbol())
        all_index.append(atom.GetIdx())
        if symbol in atom.GetSymbol():
            stars_indices.append(star_idx)
            stars_type.append(atom.GetSmarts())

    stars_bond = mol.GetBondBetweenAtoms(stars_indices[0], stars_indices[1])
    if stars_bond:
        stars_bond = stars_bond.GetBondType()

    ret_dict["symbols"] = all_symbols
    ret_dict["index"] = all_index

    ret_dict["star"] = {
        "index": stars_indices,
        "atom_type": stars_type,
        "bond_type": stars_bond,
    }

    # multiple neighbors are possible
    neighbor_indices = [
        [x.GetIdx() for x in mol.GetAtomWithIdx(stars_indices[0]).GetNeighbors()],
        [x.GetIdx() for x in mol.GetAtomWithIdx(stars_indices[1]).GetNeighbors()],
    ]

    neighbors_type = [
        [mol.GetAtomWithIdx(x).GetSmarts() for x in neighbor_indices[0]],
        [mol.GetAtomWithIdx(x).GetSmarts() for x in neighbor_indices[1]],
    ]

    # Bonds between stars and neighbors
    neighbor_bonds = [
        [
            mol.GetBondBetweenAtoms(stars_indices[0], x).GetBondType()
            for x in neighbor_indices[0]
        ],
        [
            mol.GetBondBetweenAtoms(stars_indices[1], x).GetBondType()
            for x in neighbor_indices[1]
        ],
    ]
    s_path = None
    if neighbor_indices[0][0] != neighbor_indices[1][0]:
        s_path = Chem.GetShortestPath(
            mol, neighbor_indices[0][0], neighbor_indices[1][0]
        )

    ret_dict["neighbor"] = {
        "index": neighbor_indices,
        "atom_type": neighbors_type,
        "bond_type": neighbor_bonds,
        "path": s_path,
    }

    # Stereo info
    stereo_info = []
    for b in mol.GetBonds():
        bond_type = b.GetStereo()
        if bond_type != Chem.rdchem.BondStereo.STEREONONE:
            idx = [b.GetBeginAtomIdx(), b.GetEndAtomIdx()]
            neigh_idx = b.GetStereoAtoms()
            stereo_info.append(
                {
                    "bond_type": bond_type,
                    "atom_idx": idx,
                    "bond_idx": b.GetIdx(),
                    "neighbor_idx": list(neigh_idx),
                }
            )

    ret_dict["stereo"] = stereo_info

    # Ring info
    ring_info = mol.GetRingInfo()
    ret_dict["atom_rings"] = ring_info.AtomRings()
    ret_dict["bond_rings"] = ring_info.BondRings()

    return ret_dict

get_index_to_break(mol, info)

Get indices of two connected atoms that can be broken. Indices cannot be part of an original ring.

Parameters:

Name Type Description Default
mol RWMol

[description]

required
info Dict

[description]

required

Returns:

Type Description
Tuple[int, int]

Tuple[int, int]: [description]

Source code in canonicalize_psmiles/canonicalize.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
def get_index_to_break(mol: Chem.RWMol, info: Dict) -> Tuple[int, int]:
    """Get indices of two connected atoms that can be broken.
    Indices cannot be part of an original ring.

    Args:
        mol (Chem.RWMol): [description]
        info (Dict): [description]

    Returns:
        Tuple[int, int]: [description]
    """
    # Find index of N1 and N2

    n_idx = []
    for atom in mol.GetAtoms():
        if "atomLabel" in atom.GetPropsAsDict() and "N" in atom.GetProp("atomLabel"):
            n_idx.append(atom.GetIdx())

    # Get the original bonds to make sure we don't break them
    # (1) break N1-N2 bond
    # (2) find rings
    # (3) add bond back
    bnd = mol.GetBondBetweenAtoms(n_idx[0], n_idx[1]).GetBondType()

    mol.RemoveBond(n_idx[0], n_idx[1])
    # mol.ClearComputedProps()
    GetSymmSSSR(mol)
    original_atom_rings = mol.GetRingInfo().AtomRings()
    original_bond_rings = mol.GetRingInfo().BondRings()

    # Add bond back
    mol.AddBond(n_idx[0], n_idx[1], bnd)
    # mol.ClearComputedProps()
    GetSymmSSSR(mol)
    new_atom_rings = mol.GetRingInfo().AtomRings()
    new_bond_rings = mol.GetRingInfo().BondRings()

    # Find rings with N1 and N2 that have the computed length
    ring_length_to_break = len(info["neighbor"]["path"])

    rings_match, bond_rings_match = [], []
    for atom_ring, bond_ring in zip(new_atom_rings, new_bond_rings):
        if (
            set(atom_ring).issuperset(set(n_idx))
            and len(atom_ring) == ring_length_to_break
        ):
            rings_match.append(atom_ring)
            bond_rings_match.append(bond_ring)

    logging.debug(
        f"Found {len(rings_match)} rings that contain N1 and N2 and have length {ring_length_to_break}"
    )

    if len(rings_match) == 0:
        # Remove ring_length_to_break criteria
        rings_match, bond_rings_match = [], []
        for atom_ring, bond_ring in zip(new_atom_rings, new_bond_rings):
            if set(atom_ring).issuperset(set(n_idx)):
                rings_match.append(atom_ring)
                bond_rings_match.append(bond_ring)

        logging.debug(f"Found {len(rings_match)} rings that contain N1 and N2")

    def bond_in_other_rings(bond_index):
        # check rings that do not match
        # for _ring in bond_rings_no_match:
        # if bond_index in _ring:
        # return True
        # Check the original rings; they must stay intact
        for _ring in original_bond_rings:
            if bond_index in _ring:
                return True
        return False

    def get_break_idx(ring_to_break):
        # start to search with lower index first
        sorted_ring = sorted(ring_to_break)

        logging.debug(f"Searching in ring (sorted index): {sorted_ring}")
        for idx_1 in sorted_ring:

            # get neighbors of idx_1
            possible_neighbors = [
                x.GetIdx() for x in mol.GetAtomWithIdx(int(idx_1)).GetNeighbors()
            ]

            # Sort possible neighbors, rdkit does note guarantee the order
            possible_neighbors = sorted(possible_neighbors)

            for idx_2 in possible_neighbors:
                bond_idx = mol.GetBondBetweenAtoms(idx_1, idx_2).GetIdx()
                # print(idx_1, idx_2, bond_idx, bond_in_other_rings(bond_idx))

                # idx_2 cannot be in original rings but must be in sorted_ring
                if idx_2 in sorted_ring and not bond_in_other_rings(bond_idx):

                    logging.debug(
                        f"Index {idx_1} and {idx_2} are in ring and connected with bond {bond_idx}. "
                        f"Bond {bond_idx} is not in other rings. Break."
                    )
                    return idx_1, idx_2

        raise UserWarning("Canonicalization failed.")

    # We need to check all ring, sort by size
    logging.debug(f"Possible rings to break: {rings_match}")

    # TODO: There might be multiple rings that can be broken; do we need to sort here?
    # print(rings_match)
    rings_match = sorted(rings_match, key=sum)
    # print(rings_match)

    for ring_to_break in rings_match:
        break_idx_1, break_idx_2 = get_break_idx(ring_to_break)
        if break_idx_1 is not None and break_idx_2 is not None:
            # We found indices
            break

    # if ring has only 3 atoms break at N1 and N2
    if len(ring_to_break) == 3 and len(info["symbols"]) > 5:
        break_idx_1 = n_idx[0]
        break_idx_2 = n_idx[1]

    return break_idx_1, break_idx_2

get_mol(psmiles)

Returns a RDKit mol object.

Note

In jupyter notebooks, this function draws the SMILES string

Returns:

Type Description
RWMol

Chem.MolFromSmiles: Mol object

Source code in canonicalize_psmiles/canonicalize.py
436
437
438
439
440
441
442
443
444
445
def get_mol(psmiles) -> Chem.RWMol:
    """Returns a RDKit mol object.

    Note:
        In jupyter notebooks, this function draws the SMILES string

    Returns:
        Chem.MolFromSmiles: Mol object
    """
    return Chem.RWMol(Chem.MolFromSmiles(psmiles))

nb_display(mol)

Helper function to display polymer for debug

Source code in canonicalize_psmiles/canonicalize.py
448
449
450
451
452
453
454
def nb_display(mol):
    """Helper function to display polymer for debug"""
    print(f"SMILES: {Chem.MolToCXSmiles(mol)}")
    try:
        display(mol)
    except:
        pass

reduce_multiplication(psmiles)

Reduces PSMILES repetitions. Apply multiple times to reduce to smallest PSMILES.

Examples:

[*]CCC[*] --> [*]C[*] [*]COCO[*] --> [*]CO[*] [*]COCCOCCOCCOC[*] -> [*]COCCOC[*]

Returns:

Name Type Description
str str

reduced PSMILES

Source code in canonicalize_psmiles/canonicalize.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def reduce_multiplication(psmiles: str) -> str:
    r"""Reduces PSMILES repetitions.
    Apply multiple times to reduce to smallest PSMILES.

    Examples:
        [\*]CCC[\*] --> [\*]C[\*]
        [\*]COCO[\*] --> [\*]CO[\*]
        [\*]COCCOCCOCCOC[\*] -> [\*]COCCOC[\*]

    Returns:
        str: reduced PSMILES
    """
    sm = psmiles

    # skip if rings is polymer
    if re.findall(r"\d", sm):
        return psmiles

    # Handle the case of PE
    pe_test = list(set(sm.replace("[*]", "").upper()))
    if len(pe_test) == 1 and pe_test[0] == "C":
        return "[*]C[*]"

    # All other cases
    d = {}
    for sublen in range(1, int(len(sm) / 2)):
        for i in range(0, len(sm) - sublen):
            sub = sm[i : i + sublen]
            cnt = sm.count(sub)
            if cnt >= 2 and sub not in d:
                if "[" not in sub and "]" not in sub and "*" not in sub:
                    d[sub] = cnt
    if not d:
        return psmiles

    longest_string = sorted(d, key=lambda k: len(k), reverse=True)[0]
    check_s = sm.replace("[*]", "").replace(longest_string, "")

    if not check_s:
        sm = sm.replace(longest_string, "", d[longest_string] - 1)

    return sm

unify(psmiles)

Unify PSMILES strings

Examples:

[*]COC[*] --> [*]COC[*] [*]OCC[*] --> [*]COC[*] [*]COC[*] --> [*]COC[*]

Returns:

Name Type Description
str str

unified polymer SMILES string

Source code in canonicalize_psmiles/canonicalize.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def unify(psmiles: str) -> str:
    r"""Unify PSMILES strings

    Examples:
        [\*]COC[\*] --> [\*]COC[\*]
        [\*]OCC[\*] --> [\*]COC[\*]
        [\*]COC[\*] --> [\*]COC[\*]

    Returns:
        str: unified polymer SMILES string
    """

    # Make atom indices visible if in DEBUG mode
    if logging.DEBUG >= logging.root.level:
        from rdkit.Chem.Draw import IPythonConsole

        IPythonConsole.drawOptions.addAtomIndices = True
        # IPythonConsole.drawOptions.addBondIndices = True

    # (1) Get connection info
    mol = get_mol(psmiles)

    info = get_connection_info(mol)
    logging.debug(f"(1) Set labels and get connection dict \n {pprint.pformat(info)}")

    # SPECIAL CASES:
    # (1.1) if *C* return *C*
    if psmiles == "[*]C[*]":
        logging.debug("Found [*]C[*]; returning [*]C[*]")
        return Chem.MolToSmiles(mol)

    # (1.2) if neighbors are already connected PS or '[*]/C=C(\[*])C(C)(C)C'
    if info["neighbor"]["path"] and len(info["neighbor"]["path"]) == 2:
        logging.debug(f"Neighbors are already connected. Return unified SMILES here.")
        return Chem.MolToSmiles(mol)

    # (1.3) if *C(*)(F)F: neighbors are the same
    if info["neighbor"]["index"][0][0] == info["neighbor"]["index"][1][0]:
        logging.debug("Neighbors are the same. Returning unified SMILES here")
        return Chem.MolToSmiles(mol)

    mol.GetAtomWithIdx(info["neighbor"]["index"][0][0]).SetProp("atomLabel", "N1")
    mol.GetAtomWithIdx(info["neighbor"]["index"][1][0]).SetProp("atomLabel", "N2")
    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    # (2) Connect neighbors with bond type of neighbors
    logging.debug(
        f"(2) Add bond between neighbors {info['neighbor']['index']}; bond type {info['neighbor']['bond_type'][0][0]} {info['neighbor']['bond_type'][1][0]} "
    )

    mol.AddBond(
        info["neighbor"]["index"][0][0],
        info["neighbor"]["index"][1][0],
        info["neighbor"]["bond_type"][0][0],
    )
    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    # (3) Remove stars and bonds
    logging.debug(f"(3) Remove stars and bonds")
    mol.RemoveBond(info["star"]["index"][0], info["neighbor"]["index"][0][0])
    mol.RemoveBond(info["star"]["index"][1], info["neighbor"]["index"][1][0])
    mol.RemoveAtom(info["star"]["index"][1])
    mol.RemoveAtom(info["star"]["index"][0])
    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    # (4) Canonicalize
    # We have a cyclic molecule now that we can canonicalize
    # Because it is cyclic, it will always give the same SMILES
    logging.debug("(4) Canonicalize")
    Chem.Kekulize(mol, clearAromaticFlags=True)

    sm = Chem.MolToCXSmiles(mol)
    mol = Chem.RWMol(Chem.MolFromSmiles(sm))
    Chem.Kekulize(mol, clearAromaticFlags=True)

    # FIX GH #8 idempotence of some PSMILES
    Chem.Kekulize(mol, clearAromaticFlags=True)
    sm = Chem.MolToCXSmiles(mol)
    mol = Chem.RWMol(Chem.MolFromSmiles(sm))


    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    # (5) Find bond to break
    # Kekulize before breaking the bond (find conjugated bonds)
    Chem.Kekulize(mol, clearAromaticFlags=True)

    # Find the bond and get bond type
    b0_break, b1_break = get_index_to_break(mol, info)
    btype_removed = mol.GetBondBetweenAtoms(b0_break, b1_break).GetBondType()
    if btype_removed == Chem.rdchem.BondType.AROMATIC:
        conjugated = mol.GetBondBetweenAtoms(b0_break, b1_break).GetIsConjugated()
        if conjugated:
            btype_removed = Chem.rdchem.BondType.DOUBLE
        else:
            btype_removed = Chem.rdchem.BondType.SINGLE

    logging.debug(
        f"(5) Break bond at atom idx {b0_break} and {b1_break}. Bond type was {btype_removed}"
    )

    mol.RemoveBond(b0_break, b1_break)
    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    # (6) Add stars
    logging.debug(
        f"(6) Add stars at index {b0_break} and {b1_break} "
        f"with bond type {btype_removed}. "
        f"Renumber atoms."
    )
    idx = mol.AddAtom(Chem.AtomFromSmarts("*"))
    mol.AddBond(b0_break, idx, btype_removed)

    idx = mol.AddAtom(Chem.AtomFromSmarts("*"))
    mol.AddBond(b1_break, idx, btype_removed)

    # Renumber atoms
    sm = Chem.MolToSmiles(mol)
    mol = Chem.MolFromSmiles(sm)

    if logging.DEBUG >= logging.root.level:
        nb_display(mol)

    return Chem.MolToSmiles(mol)