Commits

Daniel Lowe committed 30bdc14

Added support for the addition of one bridge e.g. 4,7-methanoindene
Added a few retained trivial terpenoid substituents and set defaultInIDs so they chain in the expected manner

  • Participants
  • Parent commits ecb661b

Comments (0)

Files changed (13)

File core/src/main/java/uk/ac/cam/ch/wwmm/opsin/PostProcessor.java

 		
 		processHydroCarbonRings(moleculeEl);
 		for (Element group : groups) {
+			detectAlkaneFusedRingBridges(group);
 			processRings(group);//processes cyclo, von baeyer and spiro tokens
 			handleGroupIrregularities(group);//handles benzyl, diethylene glycol, phenanthrone and other awkward bits of nomenclature
 		}
 			}
 		}
 	}
+	
+	/**
+	 * Looks for alkaneStems followed by a bridge forming 'o' and makes them fused ring bridge elements
+	 * e.g. the bridge prefix of 4,7-methanoindene.
+	 * Only groups whose subType attribute equals the alkaneStem subType are considered; any other group is left untouched.
+	 * When a bridge is detected the bridge forming 'o' element is detached and the group element itself is renamed.
+	 * @param group The group element to inspect; it is modified in place when it turns out to be a bridge
+	 */
+	private void detectAlkaneFusedRingBridges(Element group) {
+		if (ALKANESTEM_SUBTYPE_VAL.equals(group.getAttributeValue(SUBTYPE_ATR))){
+			Element possibleBridgeFormer = (Element) XOMTools.getNextSiblingIgnoringCertainElements(group, new String[]{UNSATURATOR_EL});//unsaturator elements are passed as the ones to skip; presumably this steps over the "an" between the stem and the 'o' - TODO confirm
+			if(possibleBridgeFormer!=null && possibleBridgeFormer.getLocalName().equals(BRIDGEFORMINGO_EL)){
+				possibleBridgeFormer.detach();//the 'o' was only needed as a marker for this detection
+				group.setLocalName(FUSEDRINGBRIDGE_EL);//only the element name is changed here; the group's attributes are not touched
+			}
+		}
+	}
 
 	/**Looks for (multiplier)cyclo/spiro/cyclo tags before chain
 	 * and replaces them with a group with appropriate SMILES

File core/src/main/java/uk/ac/cam/ch/wwmm/opsin/PreStructureBuilder.java

 			for (Element subOrRoot : substituentsAndRoot) {
 				processHW(state, subOrRoot);//hantzch-widman rings
 				FusedRingBuilder.processFusedRings(state, subOrRoot);
+				processFusedRingBridges(state, subOrRoot);
 				assignElementSymbolLocants(state, subOrRoot);
 				processRingAssemblies(state, subOrRoot);
 				processPolyCyclicSpiroNomenclature(state, subOrRoot);
 		if(currentElem != null && currentElem.getLocalName().equals(POLYCYCLICSPIRO_EL)){
 			return true;
 		}
+		if(currentElem != null && count==2 && currentElem.getLocalName().equals(FUSEDRINGBRIDGE_EL)){
+			return true;
+		}
 		boolean detectedMultiplicativeNomenclature = detectMultiplicativeNomenclature(locant, locantValues, finalSubOrRootInWord);
 		if (detectedMultiplicativeNomenclature){
 			return true;
 
 
 	/**
+	 * Processes bridges e.g. 4,7-methanoindene
+	 * Resolves and attaches said bridges to the adjacent ring fragment
+	 * For every fusedRingBridge child of subOrRoot a fragment is built from the SMILES held in the bridge's value attribute,
+	 * using the type and subType of the fragment of the next group sibling (the ring).
+	 * Two outAtoms are added to the bridge (on its first and last atom); if the element directly before the bridge is a locant
+	 * holding exactly two comma separated values these are assigned to the outAtoms and the locant element is detached.
+	 * The bridge is then joined to the ring via formEpoxide, incorporated into the ring fragment and its element detached.
+	 * @param state
+	 * @param subOrRoot The substituent/root element whose direct fusedRingBridge children are processed
+	 * @throws StructureBuildingException Propagated from the calls made while building and attaching the bridge
+	 */
+	private void processFusedRingBridges(BuildState state, Element subOrRoot) throws StructureBuildingException {
+		List<Element> bridges = XOMTools.getChildElementsWithTagName(subOrRoot, FUSEDRINGBRIDGE_EL);
+		for (Element bridge : bridges) {
+			Fragment ringFrag = state.xmlFragmentMap.get(XOMTools.getNextSibling(bridge, GROUP_EL));//NOTE(review): assumes a following group with a mapped fragment always exists; ringFrag is dereferenced on the next line without a null check
+			Fragment bridgeFrag =state.fragManager.buildSMILES(bridge.getAttributeValue(VALUE_ATR), ringFrag.getType(), ringFrag.getSubType(), NONE_LABELS_VAL);//TODO label bridges
+
+			List<Atom> bridgeAtomList =bridgeFrag.getAtomList();
+			bridgeFrag.addOutAtom(bridgeAtomList.get(0), 1, true);//first attachment point of the bridge
+			bridgeFrag.addOutAtom(bridgeAtomList.get(bridgeAtomList.size()-1), 1, true);//second attachment point; this is the same atom as the first when the bridge has only one atom
+			Element possibleLocant = (Element) XOMTools.getPreviousSibling(bridge);
+			if (possibleLocant !=null && possibleLocant.getLocalName().equals(LOCANT_EL)){
+				String[] locantArray = matchComma.split(possibleLocant.getValue());
+				if (locantArray.length==2){//a locant with any other number of values is left in place and the outAtoms stay unlocanted
+					bridgeFrag.getOutAtom(0).setLocant(locantArray[0]);
+					bridgeFrag.getOutAtom(1).setLocant(locantArray[1]);
+					possibleLocant.detach();//the locant has been consumed by the bridge
+				}
+			}
+			StructureBuildingMethods.formEpoxide(state, bridgeFrag, ringFrag.getDefaultInAtom());//reuses the epoxide forming code; the ring's default in atom is passed as the atom to join to
+			state.fragManager.incorporateFragment(bridgeFrag, ringFrag);//presumably merges the bridge's atoms into the ring fragment - TODO confirm
+			bridge.detach();//the bridge element is no longer needed in the XML
+		}
+	}
+
+
+	/**
 	 * Searches for lambdaConvention elements and applies the valency they specify to the atom
 	 * they specify on the substituent/root's fragment
 	 * @param state

File core/src/main/java/uk/ac/cam/ch/wwmm/opsin/StructureBuildingMethods.java

 		if (state.debug){System.out.println("Substitutively bonded " + from.getID() + " (" +state.xmlFragmentMap.getElement(from.getFrag()).getValue()+") " + atomToJoinTo.getID() + " (" +state.xmlFragmentMap.getElement(atomToJoinTo.getFrag()).getValue()+")");}
 	}
 
-	private static void formEpoxide(BuildState state, Fragment fragToBeJoined, Atom atomToJoinTo) throws StructureBuildingException {
+	static void formEpoxide(BuildState state, Fragment fragToBeJoined, Atom atomToJoinTo) throws StructureBuildingException {
 		Fragment fragToJoinTo = atomToJoinTo.getFrag();
 		List<Atom> atomList = fragToJoinTo.getAtomList();
 		Atom firstAtomToJoinTo;

File core/src/main/java/uk/ac/cam/ch/wwmm/opsin/XmlDeclarations.java

 
 	/**An annulene. Converted to a group by the PostProcessor*/
 	static final String ANNULEN_EL ="annulen";
+	
+	/**A bridge described in SMILES for use on rings*/
+	static final String FUSEDRINGBRIDGE_EL ="fusedRingBridge";
+	
+	/**An O that indicates that the preceding alkaneStem is in fact a bridge*/
+	static final String BRIDGEFORMINGO_EL ="bridgeFormingO";
 
 	/**A charge specifier e.g. (2+). Value is the charge to set something to*/
 	static final String CHARGESPECIFIER_EL ="chargeSpecifier";

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/regexTokens.xml

   <regexToken regex="%indicatedHydrogen%(,%indicatedHydrogen%)*-" symbol="e" tagname="hydrogen" determinise="yes"/>
   <regexToken regex="[eE]" symbol="Z" tagname="e" ignoreWhenWritingXML="yes" determinise="yes"/>
   <regexToken regex="[oO]" symbol="Y" tagname="o" ignoreWhenWritingXML="yes" determinise="yes"/>
+  <regexToken regex="[oO]" symbol="╡" tagname="bridgeFormingO" determinise="yes"/>
   <regexToken regex="[aA]" symbol="¬" tagname="a" ignoreWhenWritingXML="yes" determinise="yes"/>
   <regexToken regex="," symbol="ç" tagname="comma" ignoreWhenWritingXML="yes" determinise="yes"/>
   <regexToken regex="[isn]-" symbol="n" tagname="alkaneStemModifier" determinise="yes"/><!--case sensitive so must be a regex-->

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/regexes.xml

   <regex name="%alkaneStemTens%" value="▓"/>
   <regex name="%alkaneStemHundreds%" value="│"/>
   <regex name="%alkaneStemThousands%" value="┤"/>
+  <regex name="%bridgeFormingO%" value="╡"/>
   <!--up to ascii 180 used-->
 
 <!-- composite regexes-->
   <regex name="%suffixGroup%" value="((%relativeCisTrans%|%optLocantGroup%)?%unlocantedSuffixGroup%)"/>
   <regex name="%rootEnding%" value="(%inlineChargeGroup%*%suffixGroup%?)"/>
   <regex name="%locantOpenBracket%" value="(%stereochemistry%*%newLocantGroupNoStartingHyphen%{0,2}%stereochemistry%*%multiplier%?%openBracket%%hyphen%?)"/>
-  <regex name="%ring_CanStartWithHydro_NonDetachableFeatures%" value="((%heteroReplacement%|%hydroGroup%)*(%bigCapitalH%?%lambdaConvention%?|%lambdaConvention%?%bigCapitalH%?)%heteroReplacement%*)"/><!--bigCapitalH before lambdaConvention is preferred. Heteroatomreplacement should not really be after indicated hydrogen-->
-  <regex name="%ringNonDetachableFeatures%" value="(%heteroReplacement%*(%bigCapitalH%?%lambdaConvention%?|%lambdaConvention%?%bigCapitalH%?)%heteroReplacement%*%newLocantGroupNoStartingHyphen%?)"/><!--last locant is an indirect locant for suffixes -->
+  <regex name="%alkaneStem%" value="((%alkaneStemUnits%%a%?)?(%alkaneStemTens%%a%?)?(%alkaneStemHundreds%%a%?)?%alkaneStemThousands%|(%alkaneStemUnits%%a%?)?(%alkaneStemTens%%a%?)?%alkaneStemHundreds%|(%alkaneStemUnits%%a%?)?%alkaneStemTens%|%alkaneStemTrivial%)"/>
+  <regex name="%fusedRingBridges%" value ="(%newLocantGroupNoStartingHyphen%?%alkaneStem%%ane%%bridgeFormingO%)"/>
+  <regex name="%ring_CanStartWithHydro_NonDetachableFeatures%" value="((%heteroReplacement%|%hydroGroup%)*(%bigCapitalH%?%lambdaConvention%?|%lambdaConvention%?%bigCapitalH%?)%heteroReplacement%*%fusedRingBridges%?)"/><!--bigCapitalH before lambdaConvention is preferred. Heteroatomreplacement should not really be after indicated hydrogen-->
+  <regex name="%ringNonDetachableFeatures%" value="(%heteroReplacement%*(%bigCapitalH%?%lambdaConvention%?|%lambdaConvention%?%bigCapitalH%?)%heteroReplacement%*%newLocantGroupNoStartingHyphen%?%fusedRingBridges%?)"/><!--last locant is an indirect locant for suffixes -->
   <regex name="%acyclicNonDetachableFeatures%" value="(%heteroReplacement%*%lambdaConvention%?%newLocantGroupNoStartingHyphen%?)"/><!--last locant is an indirect locant for suffixes-->
 
 
 <!-- groups-->
   <!--acyclic-->
-  <regex name="%alkaneStem%" value="((%alkaneStemUnits%%a%?)?(%alkaneStemTens%%a%?)?(%alkaneStemHundreds%%a%?)?%alkaneStemThousands%|(%alkaneStemUnits%%a%?)?(%alkaneStemTens%%a%?)?%alkaneStemHundreds%|(%alkaneStemUnits%%a%?)?%alkaneStemTens%|%alkaneStemTrivial%)"/>
 
   <regex name="%heteroChain%" value = "((%multiplier%%heteroStem%%unsaturationBlock%)|(%heteroStem%%hyphen%?%ane%))"/>
   <regex name="%alternatingHeteroChain%" value = "(%multiplier%(%heteroAtom%|%heteroAtomaElided%){2,}%unsaturationBlock%)"/>

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/bridgeFormingO_9569RegexHash.txt

+2820194

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/bridgeFormingO_9569SerialisedAutomaton.txt

Binary file added.

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemicalRegexHash.txt

--89114340
+1089819930

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemicalSerialisedAutomaton.txt

Binary file modified.

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/simpleGroups.xml

 		<token value="O[N+]#[C-]" labels="none" valType="SMILES" functionalIDs="1">fulminic</token>
 		<token value="O[N+]#[C-]" labels="none" valType="SMILES" functionalIDs="1">fulminicacid</token>
 		<token value="O[N+]#[C-]" labels="none" valType="SMILES" functionalIDs="1">fulminic acid</token>
-		<token value="CC(CCC=C(C)C)=CCO" labels="none" valType="SMILES">geraniol</token>
+		<token value="CC(CCC=C(C)C)=CCO" labels="none" valType="SMILES" defaultInID="7">geraniol</token>
 		<token value="C(O)C(=O)CO" labels="1//2//3/" valType="SMILES">glycerone</token>
 		<token value="C(OC)COC" labels="1///2//" valType="SMILES">glyme</token>
 		<token value="C(=O)C(=O)" labels="1//2/" valType="SMILES">glyoxal</token>
 	<!--Maybe this idea should be rethought as this way of special casing things is a bit tacky -->
 		<token value="C(=O)N" labels="1//N" valType="SMILES" defaultInID="3" functionalIDs="3,3">formamide</token>
 		<token value="C(=O)N" labels="1//N" valType="SMILES" defaultInID="3" functionalIDs="3,3">methanamide</token>
+		<token value="CC(CCOc1ccc2Nc3ccccc3Nc2c1)CC\C=C(/C)CC\C=C(/C)CC\C=C(/C)CCC=C(C)C" labels="none" valType="SMILES">dihydromethanophenazine</token><!--preferred to systematic interpretation-->
 		<token value="C(OC)C(OC)" labels="1///2//" valType="SMILES">dimethoxyethane</token>
+		<token value="CC(CCOc1ccc2nc3ccccc3nc2c1)CC\C=C(/C)CC\C=C(/C)CC\C=C(/C)CCC=C(C)C" labels="none" valType="SMILES">methanophenazine</token><!--preferred to systematic interpretation-->
 	</tokenList>
 
 	<tokenList tagname="group" type="simpleGroup" subType="biochemical" symbol="G">

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/simpleSubstituents.xml

 		<token value="-NO" labels="none" valType="SMILES">hydroxamino</token>
 		<token value="=NO" labels="none" valType="SMILES">hydroximino</token>
 		<token value="-OC(C)C" labels="none" valType="SMILES">isopropoxyl</token>
+		<token value="-C(C)(C=C)CCC=C(C)C" labels="none" valType="SMILES">linalyl</token>
 		<token value="-OCCC" labels="/1/2/3" valType="SMILES" usableAsAJoiner="yes" defaultInLocant="3">propoxyl</token>
 		<token value="-N[N+](=O)[O-]" labels="none" valType="SMILES">nitramido</token>
 		<token value="-N=O" labels="none" valType="SMILES">nitrosyl</token>

File core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/substituents.xml

   <token value="[SnH4]" labels="none" valType="SMILES" usableAsAJoiner="yes">stann</token>
   <token value="[PbH4]" labels="none" valType="SMILES" usableAsAJoiner="yes">plumb</token>
 
+  <token value="C/C=C(C)/CCC=C(C)C" defaultInID="9" usableAsAJoiner="yes" labels="none" valType="SMILES">geran</token>
   <token value="CCc1cc2OCOc2cc1" labels="none" valType="SMILES">homopiperon</token>
+  <token value="C\C=C(C)/CCC=C(C)C" defaultInID="9" usableAsAJoiner="yes" labels="none" valType="SMILES">ner</token>
+  <token value="C/C=C(C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C" labels="none" valType="SMILES">phyt</token>
   <token value="CC(=O)c1ccccc1" labels="///1/2,ortho/3,meta/4,para/5/6" valType="SMILES">phenac</token>
   <token value="Cc1cc2OCOc2cc1" labels="none" valType="SMILES">piperon</token>
   <token value="CC=C" valType="SMILES" usableAsAJoiner="yes">all</token>