From c3ad178fcffcb1e8976cde7395c0ee0766053ee5 Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Thu, 13 Nov 2025 12:25:11 -0500 Subject: [PATCH 1/9] first attempt at fixing regularization --- rmgpy/data/kinetics/family.py | 125 ++++++++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 21 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index f57ce12b2e..212c1637c4 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -2960,10 +2960,17 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, out_exts[-1].append(exts[i]) # this extension splits reactions (optimization dim) if typ == 'atomExt': reg_dict[(typ, indc)][0].extend(grp2.atoms[indc[0]].atomtype) + #still pass in the regularization data to the grp2. However, this doesn't take care of the grpc + #reg_dict[(typ, indc)][1].extend(grp2.atoms[indc[0]].atomtype) + #now take care of the compliment: + elif typ == 'elExt': reg_dict[(typ, indc)][0].extend(grp2.atoms[indc[0]].radical_electrons) + #reg_dict[(typ, indc)][1].extend(grp2.atoms[indc[0]].radical_electrons) elif typ == 'bondExt': reg_dict[(typ, indc)][0].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order) + #reg_dict[(typ, indc)][1].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order) + elif boo: # this extension matches all reactions (regularization dim) if typ == 'intNewBondExt' or typ == 'extNewBondExt': @@ -2990,10 +2997,12 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, reg_val = reg_dict[(typr, indcr)] if first_time and parent.children == []: - # parent + + #parent if typr != 'intNewBondExt' and typr != 'extNewBondExt': # these dimensions should be regularized if typr == 'atomExt': - grp.atoms[indcr[0]].reg_dim_atm = list(reg_val) + pass + #grp.atoms[indcr[0]].reg_dim_atm = list(reg_val) elif typr == 'elExt': grp.atoms[indcr[0]].reg_dim_u = list(reg_val) elif typr == 'ringExt': @@ 
-3087,6 +3096,32 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, out.extend(x) return out, gave_up_split + + def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext): + rxns_from_parent = template_rxn_map[parent.label] + new_ext_rxns, comp_ext_rxns, _ = self._split_reactions(rxns_from_parent, new_ext) + atom_labeling_in_comp_rxns = dict() + unlabeled_atoms_in_comp_rxns = [] + for rxn_c in comp_ext_rxns: + for reactant in rxn_c.reactants: + for mol in reactant.molecule: + for atm in mol.atoms: + if atm.label == '': + #this atom was unlabeled + unlabeled_atmtype = ATOMTYPES[atm.symbol] + if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: + unlabeled_atoms_in_comp_rxns.append(unlabeled_atmtype) + else: + atm_label = int(atm.label.replace('*','')) + if atm_label not in atom_labeling_in_comp_rxns.keys(): + atom_labeling_in_comp_rxns[atm_label] = [ATOMTYPES[atm.symbol]] + else: + existing_atomtypes = atom_labeling_in_comp_rxns[atm_label] + existing_atomtypes.append(ATOMTYPES[atm.symbol]) + #print(f'count of missing * is {count}') + atom_labeling_in_comp_rxns_set = {k: set(v) for k, v in atom_labeling_in_comp_rxns.items()} + + return atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np.inf, iter_item_cap=np.inf): """ @@ -3163,9 +3198,46 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
extname = ext[2] + print(extname, ext[3]) if ext[3] == 'atomExt': - ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] - elif ext[3] == 'elExt': + ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] #passing regularization information to the selected extension node + + #handling regularization in complement below + atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns = self.get_compliment_reg_dim(parent, template_rxn_map, ext[0], ext[1]) + #print(ext[0].atoms[ext[4][0]], ext[0].atoms[ext[4][0]].label, ext[1].atoms[ext[4][0]], ext[1].atoms[ext[4][0]].label) + + #regularize the atom in which the extension was performed on + if ext[1] is not None: + if ext[1].atoms[ext[4][0]].label=='': + #extension was performed on an unlabeled atom + limited_atomtypes_comp = set(ext[1].atoms[ext[4][0]].atomtype).intersection(set(unlabeled_atoms_in_comp_rxns)) + #print(ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)) + ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)] + else: + adjusted_index = int(ext[1].atoms[ext[4][0]].label.replace('*','')) #i.e. ext[4]= (3,), ext[4][0] = 3, ext[0].atoms[3]=, ext[0].atoms[3].label = '*5' + ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(atom_labeling_in_comp_rxns_set[adjusted_index])] + + #make sure the rest of the atoms in the extension take on the same regularization dimensions as the parent. 
+ for i, parent_atm in enumerate(parent.item.atoms): + if i == ext[4][0]: + print('extension atom') + continue #this is the atom that the extension is focused on, handled above + elif parent_atm.reg_dim_atm[1]==[]: + print('parent atm reg_dim is empty') + continue #only take on regularization dimensions of parent if there is some + else: + ext[0].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the extension + if ext[1] is not None: + ext[1].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the complimentary extension + + + #print(ext[1].atoms[i].atomtype,' ', ext[1].atoms[i].reg_dim_atm[1]) + + + + # print(ext[1].atoms[ext[4][0]].atomtype, ) + # ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, ext[1].atoms[ext[4][0]].atomtype] #must also pass regularization information to the compliment + if ext[3] == 'elExt': ext[0].atoms[ext[4][0]].reg_dim_u = [ext[0].atoms[ext[4][0]].radical_electrons, ext[0].atoms[ext[4][0]].radical_electrons] @@ -3251,8 +3323,11 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
if complement: template_rxn_map[parent.label] = [] template_rxn_map[cextname] = comp_entries + if cextname=="Root_N-4R!H->O": + print(f'end of extend_node: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') else: template_rxn_map[parent.label] = comp_entries + return True def generate_tree(self, rxns=None, obj=None, thermo_database=None, T=1000.0, nprocs=1, min_splitable_entry_num=2, @@ -3321,6 +3396,8 @@ def rxnkey(rxn): logging.error("built tree with {} nodes".format(len(list(self.groups.entries)))) self.auto_generated = True + print(f'end of generate_tree: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') + def get_rxn_batches(self, rxns, T=1000.0, max_batch_size=800, outlier_fraction=0.02, stratum_num=8): """ @@ -3488,6 +3565,8 @@ def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, d continue boo2 = self.extend_node(entry, template_rxn_map, obj, T, iter_max=extension_iter_max, iter_item_cap=extension_iter_item_cap) if boo2: # extended node so restart while loop + # if "Root_N-4R!H->O" in template_rxn_map.keys(): + # print(f'at boo2: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') break else: # no extensions could be generated since all reactions were identical mult_completed_nodes.append(entry) @@ -3519,6 +3598,8 @@ def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, d entry.parent = self.groups.entries[pname] entry.parent.children.append(entry) + print(f'end of make_tree_nodes: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') + return def _absorb_process(self, p, conn, name): @@ -3787,9 +3868,10 @@ def simple_regularization(self, node, template_rxn_map, test=True): self.simple_regularization(child, template_rxn_map) grp = node.item + parent = node.parent.item rxns = template_rxn_map[node.label] - R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 'Br'] # set of possible R elements/atoms + R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 
'Br', 'Li'] # set of possible R elements/atoms R = [ATOMTYPES[x] for x in R] RnH = R[:] @@ -3804,14 +3886,15 @@ def simple_regularization(self, node, template_rxn_map, test=True): for i, atm1 in enumerate(grp.atoms): skip = False - if node.children == []: # if the atoms or bonds are graphically indistinguishable don't regularize - bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()} - for atm2 in grp.atoms: - if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds): - bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()} - if bdpairs == bdpairs2: - skip = True - indistinguishable.append(i) + if i <= len(parent.atoms)-1: #if we aren't at an atom definition that the parent node doesn't have (due to this child being an extNewBondExt type) + if node.children == [] and parent.atoms[i].reg_dim_atm[1]==[]: # if the atoms or bonds are graphically indistinguishable don't regularize + bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()} + for atm2 in grp.atoms: + if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds): + bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()} + if bdpairs == bdpairs2: + skip = True + indistinguishable.append(i) if not skip and atm1.reg_dim_atm[1] != [] and set(atm1.reg_dim_atm[1]) != set(atm1.atomtype): atyp = atm1.atomtype @@ -3823,14 +3906,14 @@ def simple_regularization(self, node, template_rxn_map, test=True): vals = list(set(atyp) & set(atm1.reg_dim_atm[1])) assert vals != [], 'cannot regularize to empty' - if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]): - if not test: - atm1.atomtype = vals - else: - oldvals = atm1.atomtype - atm1.atomtype = vals - if not self.rxns_match_node(node, rxns): - atm1.atomtype = oldvals + #if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]): + if not test: + atm1.atomtype = vals + else: + oldvals = 
atm1.atomtype + atm1.atomtype = vals + if not self.rxns_match_node(node, rxns): + atm1.atomtype = oldvals if not skip and atm1.reg_dim_u[1] != [] and set(atm1.reg_dim_u[1]) != set(atm1.radical_electrons): if len(atm1.radical_electrons) == 1: From 8a92e909326a805f0609a85c75322c5e0f60201e Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Thu, 13 Nov 2025 14:05:50 -0500 Subject: [PATCH 2/9] cleaned up and commented --- rmgpy/data/kinetics/family.py | 68 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 212c1637c4..2509084f4b 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -3001,8 +3001,7 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, #parent if typr != 'intNewBondExt' and typr != 'extNewBondExt': # these dimensions should be regularized if typr == 'atomExt': - pass - #grp.atoms[indcr[0]].reg_dim_atm = list(reg_val) + pass #no longer passing regularization info to the parent here. Doing this instead in `extend_node` elif typr == 'elExt': grp.atoms[indcr[0]].reg_dim_u = list(reg_val) elif typr == 'ringExt': @@ -3098,10 +3097,27 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, return out, gave_up_split def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext): + """ + Function takes in a parent node (`parent`), an extension node (`new_ext`) and its compliment (`comp_ext`). + Reactions of the parent node are split to extension and compliment. + Iterating over all the reactions that fit the complimentary node, the atomtypes of each labeled atom in each reaction are saved to a dictionary `atom_labeling_in_comp_rxns`, + where the key is the integer of the atom label (i.e. 5 in '*5') and the value is a set of all the atomtypes in all the complimentary reactions with that atom label. 
+ + Additionally, when iterating over all the reactions that fit the complimentary node, the atomtypes of each unlabeled atom in each reaction are saved to a list `unlabeled_atoms_in_comp_rxns`. + """ + + + assert comp_ext is not None, "This extension does not include a complimentary node. Cannot get regularization dimensions of complimentary node." + + #divide parent reactions into the extension node and its compliment rxns_from_parent = template_rxn_map[parent.label] new_ext_rxns, comp_ext_rxns, _ = self._split_reactions(rxns_from_parent, new_ext) + + #for saving data atom_labeling_in_comp_rxns = dict() unlabeled_atoms_in_comp_rxns = [] + + #iterate through each complimentary rxn for rxn_c in comp_ext_rxns: for reactant in rxn_c.reactants: for mol in reactant.molecule: @@ -3112,13 +3128,13 @@ def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext): if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: unlabeled_atoms_in_comp_rxns.append(unlabeled_atmtype) else: + #this is a labeled atom atm_label = int(atm.label.replace('*','')) if atm_label not in atom_labeling_in_comp_rxns.keys(): atom_labeling_in_comp_rxns[atm_label] = [ATOMTYPES[atm.symbol]] else: existing_atomtypes = atom_labeling_in_comp_rxns[atm_label] existing_atomtypes.append(ATOMTYPES[atm.symbol]) - #print(f'count of missing * is {count}') atom_labeling_in_comp_rxns_set = {k: set(v) for k, v in atom_labeling_in_comp_rxns.items()} return atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns @@ -3198,45 +3214,35 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
extname = ext[2] - print(extname, ext[3]) + if ext[3] == 'atomExt': ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] #passing regularization information to the selected extension node - #handling regularization in complement below + #handling regularization in complement below: atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns = self.get_compliment_reg_dim(parent, template_rxn_map, ext[0], ext[1]) - #print(ext[0].atoms[ext[4][0]], ext[0].atoms[ext[4][0]].label, ext[1].atoms[ext[4][0]], ext[1].atoms[ext[4][0]].label) #regularize the atom in which the extension was performed on - if ext[1] is not None: - if ext[1].atoms[ext[4][0]].label=='': - #extension was performed on an unlabeled atom - limited_atomtypes_comp = set(ext[1].atoms[ext[4][0]].atomtype).intersection(set(unlabeled_atoms_in_comp_rxns)) - #print(ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)) - ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)] - else: - adjusted_index = int(ext[1].atoms[ext[4][0]].label.replace('*','')) #i.e. ext[4]= (3,), ext[4][0] = 3, ext[0].atoms[3]=, ext[0].atoms[3].label = '*5' - ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(atom_labeling_in_comp_rxns_set[adjusted_index])] - - #make sure the rest of the atoms in the extension take on the same regularization dimensions as the parent. + if ext[1].atoms[ext[4][0]].label=='': + #extension was performed on an unlabeled atom, so pass in regularization dimensions that are at least limited to the atomtypes of all the unlabeled atoms + limited_atomtypes_comp = set(ext[1].atoms[ext[4][0]].atomtype).intersection(set(unlabeled_atoms_in_comp_rxns)) + ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)] + else: + #extension was performed on a labeled atom. 
For each labeled atom, we know all the atomtypes in the training reactions. Let's limit regularization dimensions to these known atomtypes + adjusted_index = int(ext[1].atoms[ext[4][0]].label.replace('*','')) #i.e. ext[4]= (3,), ext[4][0] = 3, ext[0].atoms[3]=, ext[0].atoms[3].label = '*5' + ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(atom_labeling_in_comp_rxns_set[adjusted_index])] + + #make sure the rest of the atoms in the extension take on the same regularization dimensions as the parent. Ensures subgraph isomorphism. for i, parent_atm in enumerate(parent.item.atoms): if i == ext[4][0]: - print('extension atom') - continue #this is the atom that the extension is focused on, handled above + continue #this is the atom that the extension is focused on, handled above if the extension was an 'atomExt' extension type elif parent_atm.reg_dim_atm[1]==[]: - print('parent atm reg_dim is empty') continue #only take on regularization dimensions of parent if there is some else: ext[0].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the extension - if ext[1] is not None: + if ext[1] is not None: #check if there's a complimentary node ext[1].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the complimentary extension - - #print(ext[1].atoms[i].atomtype,' ', ext[1].atoms[i].reg_dim_atm[1]) - - - # print(ext[1].atoms[ext[4][0]].atomtype, ) - # ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, ext[1].atoms[ext[4][0]].atomtype] #must also pass regularization information to the compliment if ext[3] == 'elExt': ext[0].atoms[ext[4][0]].reg_dim_u = [ext[0].atoms[ext[4][0]].radical_electrons, ext[0].atoms[ext[4][0]].radical_electrons] @@ -3323,8 +3329,6 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
if complement: template_rxn_map[parent.label] = [] template_rxn_map[cextname] = comp_entries - if cextname=="Root_N-4R!H->O": - print(f'end of extend_node: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') else: template_rxn_map[parent.label] = comp_entries @@ -3396,8 +3400,6 @@ def rxnkey(rxn): logging.error("built tree with {} nodes".format(len(list(self.groups.entries)))) self.auto_generated = True - print(f'end of generate_tree: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') - def get_rxn_batches(self, rxns, T=1000.0, max_batch_size=800, outlier_fraction=0.02, stratum_num=8): """ @@ -3565,8 +3567,6 @@ def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, d continue boo2 = self.extend_node(entry, template_rxn_map, obj, T, iter_max=extension_iter_max, iter_item_cap=extension_iter_item_cap) if boo2: # extended node so restart while loop - # if "Root_N-4R!H->O" in template_rxn_map.keys(): - # print(f'at boo2: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') break else: # no extensions could be generated since all reactions were identical mult_completed_nodes.append(entry) @@ -3598,8 +3598,6 @@ def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, d entry.parent = self.groups.entries[pname] entry.parent.children.append(entry) - print(f'end of make_tree_nodes: {self.groups.entries["Root_N-4R!H->O"].item.atoms[3].reg_dim_atm}') - return def _absorb_process(self, p, conn, name): From 07750e0126ab8525d3daea1bec915e0e8412046a Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Thu, 13 Nov 2025 16:07:01 -0500 Subject: [PATCH 3/9] more clean up --- rmgpy/data/kinetics/family.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 2509084f4b..9eb53992a6 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -2960,16 +2960,10 @@ def get_extension_edge(self, parent, 
template_rxn_map, obj, T, iter_max=np.inf, out_exts[-1].append(exts[i]) # this extension splits reactions (optimization dim) if typ == 'atomExt': reg_dict[(typ, indc)][0].extend(grp2.atoms[indc[0]].atomtype) - #still pass in the regularization data to the grp2. However, this doesn't take care of the grpc - #reg_dict[(typ, indc)][1].extend(grp2.atoms[indc[0]].atomtype) - #now take care of the compliment: - elif typ == 'elExt': reg_dict[(typ, indc)][0].extend(grp2.atoms[indc[0]].radical_electrons) - #reg_dict[(typ, indc)][1].extend(grp2.atoms[indc[0]].radical_electrons) elif typ == 'bondExt': reg_dict[(typ, indc)][0].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order) - #reg_dict[(typ, indc)][1].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order) elif boo: # this extension matches all reactions (regularization dim) From 3219e401705854d81555ba6f6d4154ad36bd282e Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Fri, 14 Nov 2025 15:13:05 -0500 Subject: [PATCH 4/9] pulling actual atomtypes from reactions in the complementary node instead of using their atom symbol to make an atomtype --- rmgpy/data/kinetics/family.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 9eb53992a6..68523b1036 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -3118,17 +3118,17 @@ def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext): for atm in mol.atoms: if atm.label == '': #this atom was unlabeled - unlabeled_atmtype = ATOMTYPES[atm.symbol] - if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: + unlabeled_atmtype = atm.atomtype + if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: unlabeled_atoms_in_comp_rxns.append(unlabeled_atmtype) else: #this is a labeled atom atm_label = int(atm.label.replace('*','')) if atm_label not in atom_labeling_in_comp_rxns.keys(): - atom_labeling_in_comp_rxns[atm_label] 
= [ATOMTYPES[atm.symbol]] + atom_labeling_in_comp_rxns[atm_label] = [atm.atomtype] else: existing_atomtypes = atom_labeling_in_comp_rxns[atm_label] - existing_atomtypes.append(ATOMTYPES[atm.symbol]) + existing_atomtypes.append(atm.atomtype) atom_labeling_in_comp_rxns_set = {k: set(v) for k, v in atom_labeling_in_comp_rxns.items()} return atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns From f4c81d8b3395c8cea912e29e3e01f4a14bbb7010 Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Wed, 19 Nov 2025 12:33:55 -0500 Subject: [PATCH 5/9] Scrapping previous changes/attempts to fix bug. Starting fix that allows problematic nodes to generate atomExt extensions that aren't node splitting if the optimization dimension of the regularization dictionary is more specific than the atomtype at the atom of interest being extended. For example, if the atomtype of an atom labeled *5 is [Si, F, Li, N, C, P, S] and the regularization dictionary has an optimization dimension that narrows down these atomtypes (e.g. reg_dim_atm[0] = [N, C]), then we can allow for atomExt extensions that change *5's atomtype to be [N,C] (rather than just [N] or just [C]). This way, we have an extension that narrows down *5 from [Si, F, Li, N, C, P, S] but also matches all of the training reactions at the node, so the regularization information (reg_dim_atm[1]) is passed to the group. 
--- rmgpy/data/kinetics/family.py | 117 ++++++---------------------------- rmgpy/molecule/group.py | 30 ++++++++- 2 files changed, 50 insertions(+), 97 deletions(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 68523b1036..f57ce12b2e 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -2965,7 +2965,6 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, elif typ == 'bondExt': reg_dict[(typ, indc)][0].extend(grp2.get_bond(grp2.atoms[indc[0]], grp2.atoms[indc[1]]).order) - elif boo: # this extension matches all reactions (regularization dim) if typ == 'intNewBondExt' or typ == 'extNewBondExt': # these are bond formation extensions, we want to expand these until we get splits @@ -2991,11 +2990,10 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, reg_val = reg_dict[(typr, indcr)] if first_time and parent.children == []: - - #parent + # parent if typr != 'intNewBondExt' and typr != 'extNewBondExt': # these dimensions should be regularized if typr == 'atomExt': - pass #no longer passing regularization info to the parent here. Doing this instead in `extend_node` + grp.atoms[indcr[0]].reg_dim_atm = list(reg_val) elif typr == 'elExt': grp.atoms[indcr[0]].reg_dim_u = list(reg_val) elif typr == 'ringExt': @@ -3089,49 +3087,6 @@ def get_extension_edge(self, parent, template_rxn_map, obj, T, iter_max=np.inf, out.extend(x) return out, gave_up_split - - def get_compliment_reg_dim(self, parent, template_rxn_map, new_ext, comp_ext): - """ - Function takes in a parent node (`parent`), an extension node (`new_ext`) and its compliment (`comp_ext`). - Reactions of the parent node are split to extension and compliment. - Iterating over all the reactions that fit the complimentary node, the atomtypes of each labeled atom in each reaction are saved to a dictionary `atom_labeling_in_comp_rxns`, - where the key is the integer of the atom label (i.e. 
5 in '*5') and the value is a set of all the atomtypes in all the complimentary reactions with that atom label. - - Additionally, when iterating over all the reactions that fit the complimentary node, the atomtypes of each unlabeled atom in each reaction are saved to a list `unlabeled_atoms_in_comp_rxns`. - """ - - - assert comp_ext is not None, "This extension does not include a complimentary node. Cannot get regularization dimensions of complimentary node." - - #divide parent reactions into the extension node and its compliment - rxns_from_parent = template_rxn_map[parent.label] - new_ext_rxns, comp_ext_rxns, _ = self._split_reactions(rxns_from_parent, new_ext) - - #for saving data - atom_labeling_in_comp_rxns = dict() - unlabeled_atoms_in_comp_rxns = [] - - #iterate through each complimentary rxn - for rxn_c in comp_ext_rxns: - for reactant in rxn_c.reactants: - for mol in reactant.molecule: - for atm in mol.atoms: - if atm.label == '': - #this atom was unlabeled - unlabeled_atmtype = atm.atomtype - if unlabeled_atmtype not in unlabeled_atoms_in_comp_rxns: - unlabeled_atoms_in_comp_rxns.append(unlabeled_atmtype) - else: - #this is a labeled atom - atm_label = int(atm.label.replace('*','')) - if atm_label not in atom_labeling_in_comp_rxns.keys(): - atom_labeling_in_comp_rxns[atm_label] = [atm.atomtype] - else: - existing_atomtypes = atom_labeling_in_comp_rxns[atm_label] - existing_atomtypes.append(atm.atomtype) - atom_labeling_in_comp_rxns_set = {k: set(v) for k, v in atom_labeling_in_comp_rxns.items()} - - return atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np.inf, iter_item_cap=np.inf): """ @@ -3208,36 +3163,9 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
extname = ext[2] - if ext[3] == 'atomExt': - ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] #passing regularization information to the selected extension node - - #handling regularization in complement below: - atom_labeling_in_comp_rxns_set, unlabeled_atoms_in_comp_rxns = self.get_compliment_reg_dim(parent, template_rxn_map, ext[0], ext[1]) - - #regularize the atom in which the extension was performed on - if ext[1].atoms[ext[4][0]].label=='': - #extension was performed on an unlabeled atom, so pass in regularization dimensions that are at least limited to the atomtypes of all the unlabeled atoms - limited_atomtypes_comp = set(ext[1].atoms[ext[4][0]].atomtype).intersection(set(unlabeled_atoms_in_comp_rxns)) - ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(limited_atomtypes_comp)] - else: - #extension was performed on a labeled atom. For each labeled atom, we know all the atomtypes in the training reactions. Let's limit regularization dimensions to these known atomtypes - adjusted_index = int(ext[1].atoms[ext[4][0]].label.replace('*','')) #i.e. ext[4]= (3,), ext[4][0] = 3, ext[0].atoms[3]=, ext[0].atoms[3].label = '*5' - ext[1].atoms[ext[4][0]].reg_dim_atm = [ext[1].atoms[ext[4][0]].atomtype, list(atom_labeling_in_comp_rxns_set[adjusted_index])] - - #make sure the rest of the atoms in the extension take on the same regularization dimensions as the parent. Ensures subgraph isomorphism. 
- for i, parent_atm in enumerate(parent.item.atoms): - if i == ext[4][0]: - continue #this is the atom that the extension is focused on, handled above if the extension was an 'atomExt' extension type - elif parent_atm.reg_dim_atm[1]==[]: - continue #only take on regularization dimensions of parent if there is some - else: - ext[0].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the extension - if ext[1] is not None: #check if there's a complimentary node - ext[1].atoms[i].reg_dim_atm[1] = parent_atm.reg_dim_atm[1] #passing regularization info from parent to the complimentary extension - - - if ext[3] == 'elExt': + ext[0].atoms[ext[4][0]].reg_dim_atm = [ext[0].atoms[ext[4][0]].atomtype, ext[0].atoms[ext[4][0]].atomtype] + elif ext[3] == 'elExt': ext[0].atoms[ext[4][0]].reg_dim_u = [ext[0].atoms[ext[4][0]].radical_electrons, ext[0].atoms[ext[4][0]].radical_electrons] @@ -3325,7 +3253,6 @@ def extend_node(self, parent, template_rxn_map, obj=None, T=1000.0, iter_max=np. 
template_rxn_map[cextname] = comp_entries else: template_rxn_map[parent.label] = comp_entries - return True def generate_tree(self, rxns=None, obj=None, thermo_database=None, T=1000.0, nprocs=1, min_splitable_entry_num=2, @@ -3860,10 +3787,9 @@ def simple_regularization(self, node, template_rxn_map, test=True): self.simple_regularization(child, template_rxn_map) grp = node.item - parent = node.parent.item rxns = template_rxn_map[node.label] - R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 'Br', 'Li'] # set of possible R elements/atoms + R = ['H', 'C', 'N', 'O', 'Si', 'S', 'Cl', 'F', 'Br'] # set of possible R elements/atoms R = [ATOMTYPES[x] for x in R] RnH = R[:] @@ -3878,15 +3804,14 @@ def simple_regularization(self, node, template_rxn_map, test=True): for i, atm1 in enumerate(grp.atoms): skip = False - if i <= len(parent.atoms)-1: #if we aren't at an atom definition that the parent node doesn't have (due to this child being an extNewBondExt type) - if node.children == [] and parent.atoms[i].reg_dim_atm[1]==[]: # if the atoms or bonds are graphically indistinguishable don't regularize - bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()} - for atm2 in grp.atoms: - if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds): - bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()} - if bdpairs == bdpairs2: - skip = True - indistinguishable.append(i) + if node.children == []: # if the atoms or bonds are graphically indistinguishable don't regularize + bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()} + for atm2 in grp.atoms: + if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds): + bdpairs2 = {(atm, tuple(bd.order)) for atm, bd in atm2.bonds.items()} + if bdpairs == bdpairs2: + skip = True + indistinguishable.append(i) if not skip and atm1.reg_dim_atm[1] != [] and set(atm1.reg_dim_atm[1]) != set(atm1.atomtype): atyp = atm1.atomtype @@ 
-3898,14 +3823,14 @@ def simple_regularization(self, node, template_rxn_map, test=True): vals = list(set(atyp) & set(atm1.reg_dim_atm[1])) assert vals != [], 'cannot regularize to empty' - #if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]): - if not test: - atm1.atomtype = vals - else: - oldvals = atm1.atomtype - atm1.atomtype = vals - if not self.rxns_match_node(node, rxns): - atm1.atomtype = oldvals + if all([set(child.item.atoms[i].atomtype) <= set(vals) for child in node.children]): + if not test: + atm1.atomtype = vals + else: + oldvals = atm1.atomtype + atm1.atomtype = vals + if not self.rxns_match_node(node, rxns): + atm1.atomtype = oldvals if not skip and atm1.reg_dim_u[1] != [] and set(atm1.reg_dim_u[1]) != set(atm1.radical_electrons): if len(atm1.radical_electrons) == 1: diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py index 279fbc641e..582a164f3b 100644 --- a/rmgpy/molecule/group.py +++ b/rmgpy/molecule/group.py @@ -1577,7 +1577,7 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', atm_ind=N """ cython.declare(atoms=list, atm=GroupAtom, atm2=GroupAtom, bd=GroupBond, i=int, j=int, extents=list, RnH=list, typ=list) - + print('im in') extents = [] if r_bonds is None: r_bonds = [1, 1.5, 2, 3, 4] @@ -1690,6 +1690,7 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', atm_ind=N elif typ[0].label == 'R!H': extents.extend(self.specify_atom_extensions(i, basename, list(set(atm.reg_dim_atm[0]) & set(r)))) else: + print(set(typ), set(atm.reg_dim_atm[0]), list(set(typ) & set(atm.reg_dim_atm[0]))) extents.extend(self.specify_atom_extensions(i, basename, list(set(typ) & set(atm.reg_dim_atm[0])))) if atm.reg_dim_u == []: if len(atm.radical_electrons) != 1: @@ -1732,6 +1733,8 @@ def specify_atom_extensions(self, i, basename, r): grps = [] Rset = set(r) + + #consider node splitting for item in r: grp = deepcopy(self) grpc = deepcopy(self) @@ -1757,6 +1760,31 @@ def 
specify_atom_extensions(self, i, basename, r): grps.append( (grp, grpc, basename + '_' + str(i + 1) + old_atom_type_str + '->' + item.label, 'atomExt', (i,))) + #generate an extension without node splitting + if len(self.atoms[i].atomtype)>len(Rset): + if all(r in self.atoms[i].atomtype for r in Rset): + #that means even if we update the atomtype of the atom to the Rset, it will still be a specification + grp = deepcopy(self) + grp.atoms[i].atomtype = list(Rset) + + #rename + old_atom_type = grp.atoms[i].atomtype + + if len(old_atom_type) > 1: + labelList = [] + old_atom_type_str = '' + for k in old_atom_type: + labelList.append(k.label) + for p in sorted(labelList): + old_atom_type_str += p + elif len(old_atom_type) == 0: + old_atom_type_str = "" + else: + old_atom_type_str = old_atom_type[0].label + + grps.append( + (grp, None, basename + '_' + str(i + 1) + old_atom_type_str + '->' + ''.join(r.label for r in Rset), 'atomExt', (i,))) + return grps def specify_ring_extensions(self, i, basename): From 54466605f98bc0611ca1266cc0beed7b4db55d45 Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Wed, 19 Nov 2025 16:05:42 -0500 Subject: [PATCH 6/9] handling leaf nodes. Running leaf nodes through get_extension_edge so regularization info is passed (but not actually extending them). 
--- rmgpy/data/kinetics/family.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index f57ce12b2e..760a7c9271 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -3801,10 +3801,17 @@ def simple_regularization(self, node, template_rxn_map, test=True): if isinstance(node.item, Group): indistinguishable = [] + + if node.children==[]: #if this is a leaf node, run it through get_extension_edge so that the regularization info is passed to leaf node + print('extending leaf nodes to get regularization info') + _, _ = self.get_extension_edge(node, template_rxn_map, obj=None, T=1000.0, iter_max=1, iter_item_cap=1) + for i, atm1 in enumerate(grp.atoms): skip = False - if node.children == []: # if the atoms or bonds are graphically indistinguishable don't regularize + if node.children == []: + + # if the atoms or bonds are graphically indistinguishable don't regularize bdpairs = {(atm, tuple(bd.order)) for atm, bd in atm1.bonds.items()} for atm2 in grp.atoms: if atm1 is not atm2 and atm1.atomtype == atm2.atomtype and len(atm1.bonds) == len(atm2.bonds): From 8ed710f274b4e35fcb0de08967a11c19056b7c4c Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Fri, 21 Nov 2025 09:30:31 -0500 Subject: [PATCH 7/9] cleaning up print statements --- rmgpy/molecule/group.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rmgpy/molecule/group.py b/rmgpy/molecule/group.py index 582a164f3b..8caa3663d2 100644 --- a/rmgpy/molecule/group.py +++ b/rmgpy/molecule/group.py @@ -1577,7 +1577,7 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', atm_ind=N """ cython.declare(atoms=list, atm=GroupAtom, atm2=GroupAtom, bd=GroupBond, i=int, j=int, extents=list, RnH=list, typ=list) - print('im in') + extents = [] if r_bonds is None: r_bonds = [1, 1.5, 2, 3, 4] @@ -1690,7 +1690,6 @@ def get_extensions(self, r=None, r_bonds=None, r_un=None, basename='', 
atm_ind=N elif typ[0].label == 'R!H': extents.extend(self.specify_atom_extensions(i, basename, list(set(atm.reg_dim_atm[0]) & set(r)))) else: - print(set(typ), set(atm.reg_dim_atm[0]), list(set(typ) & set(atm.reg_dim_atm[0]))) extents.extend(self.specify_atom_extensions(i, basename, list(set(typ) & set(atm.reg_dim_atm[0])))) if atm.reg_dim_u == []: if len(atm.radical_electrons) != 1: @@ -1762,6 +1761,7 @@ def specify_atom_extensions(self, i, basename, r): #generate an extension without node splitting if len(self.atoms[i].atomtype)>len(Rset): + print('generating a non-splitting extension') if all(r in self.atoms[i].atomtype for r in Rset): #that means even if we update the atomtype of the atom to the Rset, it will still be a specification grp = deepcopy(self) From d6e469a6552cf50fda52692a4b8e34728fcbb0bd Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Mon, 1 Dec 2025 11:04:10 -0500 Subject: [PATCH 8/9] Prevent clearing of the regularization dimensions during each step of the cascade algorithm. Only the optimization dimension (index 0 of each reg_dim list, e.g. reg_dim_atm[0]) is cleared; the regularization dimension (index 1, e.g. reg_dim_atm[1]) is preserved. 
--- rmgpy/data/kinetics/family.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index 760a7c9271..e4841e99ed 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -3395,7 +3395,14 @@ def prune_tree(self, rxns, newrxns, thermo_database=None, new_fraction_threshold parent.children.remove(entry) del self.groups.entries[key] else: - entry.item.clear_reg_dims() + for atm in entry.item.atoms: + atm.reg_dim_atm[0] = [] #only clear the optimization dimension, preserve the regularization dimension + atm.reg_dim_u[0] = [] + atm.reg_dim_r[0] = [] + atm.reg_dim_site[0] = [] + atm.reg_dim_morphology[0] = [] + for bd in self.get_all_edges(): + bd.reg_dim[0] = [] def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, depth=0, min_splitable_entry_num=2, min_rxns_to_spawn=20, extension_iter_max=np.inf, extension_iter_item_cap=np.inf): From 50e758838cf836bb76ff4531e95e7a6dd9786f2f Mon Sep 17 00:00:00 2001 From: Nora Khalil Date: Mon, 1 Dec 2025 12:58:12 -0500 Subject: [PATCH 9/9] Fix prune_tree: call get_all_edges() on 'entry.item' instead of 'self' when clearing bond reg_dim --- rmgpy/data/kinetics/family.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rmgpy/data/kinetics/family.py b/rmgpy/data/kinetics/family.py index e4841e99ed..284db46cc5 100644 --- a/rmgpy/data/kinetics/family.py +++ b/rmgpy/data/kinetics/family.py @@ -3401,7 +3401,7 @@ def prune_tree(self, rxns, newrxns, thermo_database=None, new_fraction_threshold atm.reg_dim_r[0] = [] atm.reg_dim_site[0] = [] atm.reg_dim_morphology[0] = [] - for bd in self.get_all_edges(): + for bd in entry.item.get_all_edges(): bd.reg_dim[0] = [] def make_tree_nodes(self, template_rxn_map=None, obj=None, T=1000.0, nprocs=0, depth=0, min_splitable_entry_num=2,