Commits

Daniel Lowe committed 0076116

Added support for oligosaccharides

  • Participants
  • Parent commits 0d85878

Comments (0)

Files changed (15)

File opsin-core/src/main/java/uk/ac/cam/ch/wwmm/opsin/ComponentProcessor.java

 			for (Element subBracketOrRoot : substituentsAndRootAndBrackets) {
 				assignLocantsAndMultipliers(subBracketOrRoot);
 			}
+			processGlycosidicLinkgageDescriptors(substituents, brackets);
 			processWordLevelMultiplierIfApplicable(word, wordCount);
 		}
 		new WordRulesOmittedSpaceCorrector(state, parse).correctOmittedSpaces();//TODO where should this go?
 	    }
 	    likelyAtom.addChargeAndProtons(chargeChange, protonChange);
 	}
+	
+	/**
+	 * Converts a glycosidic linkage description e.g. (1->4) into an O[1-9] locant
+	 * If the carbohydrate is preceded by substituents these are placed into a bracket and the bracket locanted
+	 * <p>
+	 * For every substituent that has a carbohydrateLocant child element:
+	 * the first digit of the descriptor must be the locant of an atom that is one of the outAtoms of the
+	 * substituent's group fragment, and the second digit is turned into the locant attribute value "O" + digit.
+	 * That attribute is placed on a newly created bracket, on the enclosing bracket or on the substituent itself
+	 * (see the comment inside the method for the rules) and the carbohydrateLocant element is then detached.
+	 * <p>
+	 * NOTE(review): the method name is misspelt ("Linkgage"); it is left as-is because the caller uses this spelling.
+	 * @param substituents The substituent elements to inspect for a carbohydrateLocant child
+	 * @param brackets The list of bracket elements; any bracket created by this method is appended to it
+	 * @throws StructureBuildingException If the first locant does not correspond to an outAtom of the carbohydrate fragment,
+	 * if OpsinTools.getNextGroup finds no group after the carbohydrate, or if the element that should receive the
+	 * locant already has one (presumably also thrown by getAtomByLocantOrThrow when the locant is absent - TODO confirm)
+	 */
+	private void processGlycosidicLinkgageDescriptors(List<Element> substituents, List<Element> brackets) throws StructureBuildingException {
+		for (Element substituent : substituents) {
+			List<Element> carbLocants = XOMTools.getChildElementsWithTagName(substituent, CARBOHYDRATELOCANT_EL);
+			if (carbLocants.size() > 0){
+				if (carbLocants.size() > 1){
+					throw new RuntimeException("OPSIN Bug: More than 1 glycosidic linkage locant associated with subsituted");//NOTE(review): "subsituted" looks like a typo for "substituent"; unchecked exception as this is treated as an internal bug
+				}
+				Element group = substituent.getFirstChildElement(GROUP_EL);
+				Fragment carbFrag = state.xmlFragmentMap.get(group);
+				String carbLocantStr = carbLocants.get(0).getValue();//the descriptor text e.g. (1->4)
+				String locantAnomeric = carbLocantStr.substring(1,2);//character at index 1: the digit before "->" (assumes a single character open bracket)
+				String locantToConnectTo = carbLocantStr.substring(4,5);//character at index 4: the digit after "->"
+				Atom anomericAtom = carbFrag.getAtomByLocantOrThrow(locantAnomeric);
+				boolean anomericIsOutAtom = false;
+				for (int i = 0; i < carbFrag.getOutAtomCount(); i++) {//check whether any outAtom of the fragment is the atom with the first locant
+					if (carbFrag.getOutAtom(i).getAtom().equals(anomericAtom)){
+						anomericIsOutAtom = true;
+					}
+				}
+				if (!anomericIsOutAtom){
+					throw new StructureBuildingException("Invalid glycoside linkage descriptor. Locant: " + locantAnomeric + " should point to the anomeric carbon");
+				}
+				
+				if (OpsinTools.getNextGroup(group)==null){//there must be a following group for the descriptor to link to
+					throw new StructureBuildingException("Glycoside linkage descriptor should be followed by a carbohydrate: " + carbLocantStr);
+				}
+				Element parent = (Element) substituent.getParent();
+				Attribute locantAtr = new Attribute(LOCANT_ATR, "O" + locantToConnectTo);//e.g. (1->4) gives a locant of O4; added to exactly one element below
+
+				Element elementAfterSubstituent = (Element) XOMTools.getNextSibling(substituent);				
+				boolean hasAdjacentGroupToSubstitute = (elementAfterSubstituent !=null &&
+						(elementAfterSubstituent.getLocalName().equals(SUBSTITUENT_EL) ||
+						elementAfterSubstituent.getLocalName().equals(BRACKET_EL) ||
+						elementAfterSubstituent.getLocalName().equals(ROOT_EL)));
+				
+
+				/* If a carbohydrate is not at the end of a scope but is preceded by substituents/brackets
+				 * these are bracketed and the locant assigned to the bracket.
+				 * Else If the group is the only thing in a bracket the locant is assigned to the bracket (this is used to describe branches)
+				 * Else the locant is assigned to the substituent
+				 *
+				 * "Not at the end of a scope" is tested via hasAdjacentGroupToSubstitute i.e. the next sibling
+				 * of the substituent is a substituent, bracket or root element.
+				 */
+				boolean bracketAdded =false;
+				if (hasAdjacentGroupToSubstitute){
+					//now find the brackets/substituents before this element
+					Element previous = (Element) XOMTools.getPreviousSibling(substituent);
+					List<Element> previousElements = new ArrayList<Element>();
+					while( previous !=null){
+						if (!previous.getLocalName().equals(SUBSTITUENT_EL) && !previous.getLocalName().equals(BRACKET_EL)){
+							break;//stop at the first preceding sibling that is neither a substituent nor a bracket
+						}
+						previousElements.add(previous);
+						previous = (Element) XOMTools.getPreviousSibling(previous);
+					}
+					if (previousElements.size() > 0 ){//an explicit bracket is needed
+						Collections.reverse(previousElements);//elements were collected walking backwards; restore their original order
+						Element bracket = new Element(BRACKET_EL);
+						bracket.addAttribute(locantAtr);
+						int indexToInsertAt = parent.indexOf(previousElements.get(0));//recorded before anything is detached so the bracket takes the position of the earliest moved element
+						for (Element element : previousElements) {
+							element.detach();
+							bracket.appendChild(element);
+						}
+
+						substituent.detach();
+						bracket.appendChild(substituent);//the carbohydrate substituent becomes the last child of the new bracket
+						parent.insertChild(bracket, indexToInsertAt);
+						brackets.add(bracket);//record the new bracket in the caller's list
+						bracketAdded = true;
+					}
+				}
+				
+				if (!bracketAdded) {
+					Element elToAddAtrTo;
+					if (parent.getLocalName().equals(BRACKET_EL) && !hasAdjacentGroupToSubstitute){
+						elToAddAtrTo = parent;//substituent is inside a bracket with nothing substitutable after it: locant the bracket
+					}
+					else{
+						elToAddAtrTo = substituent;
+					}
+					if (elToAddAtrTo.getAttribute(LOCANT_ATR) !=null){
+						throw new StructureBuildingException("Carbohydrate with glycoside linkage descriptor should not also have a locant: " + elToAddAtrTo.getAttributeValue(LOCANT_ATR));
+					}
+					elToAddAtrTo.addAttribute(locantAtr);
+				}
+				carbLocants.get(0).detach();//the descriptor has been converted to a locant attribute so its element is removed
+			}
+		}
+	}
 
 	/**
 	 * Moves a multiplier out of a bracket if the bracket contains only one substituent

File opsin-core/src/main/java/uk/ac/cam/ch/wwmm/opsin/StringTools.java

             case '\u00B1': return "+-";//plus minus symbol
             case '\u2213': return "-+";
             
+            case '\u2192': return "->";//right arrows
+            case '\u2794': return "->";
+            case '\u2799': return "->";
+            case '\u279C': return "->";
+            
             case '\u00C6': return "AE";//common ligatures
             case '\u00E6': return "ae";
             case '\u0152': return "OE";

File opsin-core/src/main/java/uk/ac/cam/ch/wwmm/opsin/XmlDeclarations.java

 	/**An O that indicates that the preceding alkaneStem is in fact a bridge*/
 	static final String BRIDGEFORMINGO_EL ="bridgeFormingO";
 
+	/**A locant indicating the positions for a glycosidic linkage. The first locant will point to the anomeric carbon*/
+	static final String CARBOHYDRATELOCANT_EL ="carbohydrateLocant";	
+
 	/**Indicates the size of the ring in a carbohydrate e.g. furanose = 5*/
 	static final String CARBOHYDRATERINGSIZE_EL ="carbohydrateRingSize";
 

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/regexTokens.xml

   <regexToken regex="[isn]-" symbol="n" tagname="alkaneStemModifier" determinise="yes"/><!--case sensitive so must be a regex-->
   <regexToken regex="([oO][rR][tT][hH][oO]|[mM][eE][tT][aA]|[pP][aA][rR][aA])-?|(o|[mM]|[p])-" symbol="O" tagname="orthoMetaPara" determinise="yes"/><!-- O- and P- means oxygen and phosphorus locant-->
   <regexToken regex="%locantTypes%,%locantTypes%(:%locantTypes%,%locantTypes%)*(:%locantTypes%,(%locantTypes%-|%locantTypesOptionalHyphen%-?))" symbol="L" tagname="colonSeperatedLocant" determinise="yes"/>
+  <regexToken regex="%openBracket%[1-9]->[1-9]%closeBracket%" symbol="ì" tagname="carbohydrateLocant" determinise="yes"/>
+
   <regexToken regex="[EZz]" symbol="æ" type="EorZ" tagname="stereoChemistry" determinise="yes" /><!--unbracketted E or Z for stereochemistry, e intentionally disabled to avoid ambiguity with the end of tokens where e is optional-->
   <regex name="%alphaBetaLocant%" regex="([1-9][0-9]?([abx]|[aA][lL][pP][hH][aA]|[bB][eE][tT][aA]|[xX][iI]))"/>
   <regexToken regex="((%alphaBetaLocant%,|%locantTypes%,)*(%alphaBetaLocant%,(%alphaBetaLocant%,|%locantTypes%,)*(%alphaBetaLocant%-?|%locantTypes%-|%locantTypesOptionalHyphen%-?)|%alphaBetaLocant%-?))" symbol="¹" type="alphaOrBeta" tagname="stereoChemistry" determinise="yes"/>

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/regexes.xml

   <regex name="%endOfSubstituent%" value="é"/>
 	<regex name="%carbohydrateUnlocantedRootSuffix%" value="ê"/>
   <regex name="%carbohydrateLocantedRootSuffix%" value="ë"/>
-  <!--next would be Alt 0236-->
+  <regex name="%carbohydrateLocant%" value="ì"/>
+  <!--next would be Alt 0237-->
   <regex name="%endOfFunctionalGroup%" value="û"/>
 
 <!-- composite regexes-->
   <regex name="%conjunctiveSideChainInline%" value ="(%multiplier%?(%acidStem%(%infixedInlineSuffix%|%inlineSuffix%|%o%%infixableInlineSuffix%)))"/><!--No hyphen/locant to avoid ambiguity with a mixture e.g. benzene-acetylbenzene-->
 
   <regex name="%cyclicSugar%" value="(%dlStereochemistryPrefix%?%carbohydrateStem%%hyphen%?%carbohydrateRingSize%%e%?(%hyphen%?%carbohydrateUnlocantedRootSuffix%)?(%optLocantGroupNoOMP%%carbohydrateLocantedRootSuffix%)?)"/>
-  <regex name="%cyclicSugarSubstituent%" value="(%cyclicSugar%%hyphen%?(%carbohydrateInlineSuffix%|%locant%%locantedAminoAcidOrCarbohydrateYl%))"/>
+  <regex name="%cyclicSugarSubstituent%" value="(%cyclicSugar%%hyphen%?(%carbohydrateInlineSuffix%(%hyphen%?%carbohydrateLocant%)?|%locant%%locantedAminoAcidOrCarbohydrateYl%))"/>
 
   <!--Fused ring terms-->
   <regex name="%benzoComponent%" value ="((%bracketedLocant%|%locant%)%benzo%)"/>

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/carbohydrateLocant_236RegexHash.txt

+-2006080166

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/carbohydrateLocant_236SerialisedAutomaton.aut

Binary file added.

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/carbohydrateLocant_236_reversed_RegexHash.txt

+-2006080166

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/carbohydrateLocant_236_reversed_SerialisedAutomaton.aut

Binary file added.

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemicalRegexHash.txt

--549724911
+172289192

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemicalSerialisedAutomaton.aut

Binary file modified.

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemical_reversed_RegexHash.txt

--549724911
+172289192

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/serialisedAutomata/chemical_reversed_SerialisedAutomaton.aut

Binary file modified.

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/tokenList.dtd

 <!-- See tokenLists.dtd for further explanation of these terms -->
 <!ELEMENT tokenList (token)+>
-<!ATTLIST tokenList tagname (alkaneStemComponent|alkaneStemModifier|bridgeFormingO|carbohydrateRingSize|chargeSpecifier|closebracket|cyclo|dlStereochemistry|functionalClass|functionalGroup|functionalModifier|fusedRingBridge|group|heteroatom|hydro|hyphen|ine|infix|hydrocarbonFusedRingSystem|multiplier|NA|openbracket|oxidationNumberSpecifier|polyCyclicSpiro|ringAssemblyMultiplier|stereoChemistry|structuralCloseBracket|structuralOpenBracket|subtractivePrefix|suffix|suffixPrefix|unsaturator) #REQUIRED
+<!ATTLIST tokenList tagname (alkaneStemComponent|alkaneStemModifier|bridgeFormingO|carbohydrateLocant|carbohydrateRingSize|chargeSpecifier|closebracket|cyclo|dlStereochemistry|functionalClass|functionalGroup|functionalModifier|fusedRingBridge|group|heteroatom|hydro|hyphen|ine|infix|hydrocarbonFusedRingSystem|multiplier|NA|openbracket|oxidationNumberSpecifier|polyCyclicSpiro|ringAssemblyMultiplier|stereoChemistry|structuralCloseBracket|structuralOpenBracket|subtractivePrefix|suffix|suffixPrefix|unsaturator) #REQUIRED
                     type (acidStem|aminoAcid|basic|carbohydrateChainLength|carbohydrateConfigurationalPrefix|carbohydrateStem|chain|charge|cisOrTrans|chalcogenAcidStem|diValentGroup|EorZ|functionalClass|group|inline|monoValentGroup|monoValentStandaloneGroup|nonCarboxylicAcid|ring|simpleGroup|substituent|root|VonBaeyer) #IMPLIED
                     subType (alkaneStem|anhydride|arylGroup|arylSubstituent|biochemical|carbohydrate|cycleformer|cyclicUnsaturableHydrocarbon|dedicatedFunctionalReplacementPrefix|elementaryAtom|endInAn|endInIc|endInIne|epoxyLike|functionalClassGroup|groupStem|halideOrPseudoHalide|hantzschWidman|heteroStem|fusionRing|simpleGroup|multiRadicalSubstituent|noAcyl|none|oxidoLike|perhalogeno|phospho|simpleSubstituent|substituent|terminal|ylForAcyl|ylForNothing|ylForYl) #IMPLIED
                     symbol CDATA #REQUIRED

File opsin-core/src/main/resources/uk/ac/cam/ch/wwmm/opsin/resources/tokenLists.dtd

 <!-- A list of tokens -->
 <!ELEMENT tokenLists (tokenList)+>
 <!ELEMENT tokenList (token)+>
-<!ATTLIST tokenList tagname (alkaneStemComponent|alkaneStemModifier|bridgeFormingO|carbohydrateRingSize|chargeSpecifier|closebracket|cyclo|dlStereochemistry|functionalClass|functionalGroup|functionalModifier|fusedRingBridge|group|heteroatom|hydro|hyphen|ine|infix|hydrocarbonFusedRingSystem|multiplier|NA|openbracket|oxidationNumberSpecifier|polyCyclicSpiro|ringAssemblyMultiplier|stereoChemistry|structuralCloseBracket|structuralOpenBracket|subtractivePrefix|suffix|suffixPrefix|unsaturator) #REQUIRED
+<!ATTLIST tokenList tagname (alkaneStemComponent|alkaneStemModifier|bridgeFormingO|carbohydrateLocant|carbohydrateRingSize|chargeSpecifier|closebracket|cyclo|dlStereochemistry|functionalClass|functionalGroup|functionalModifier|fusedRingBridge|group|heteroatom|hydro|hyphen|ine|infix|hydrocarbonFusedRingSystem|multiplier|NA|openbracket|oxidationNumberSpecifier|polyCyclicSpiro|ringAssemblyMultiplier|stereoChemistry|structuralCloseBracket|structuralOpenBracket|subtractivePrefix|suffix|suffixPrefix|unsaturator) #REQUIRED
                     type (acidStem|aminoAcid|basic|carbohydrateChainLength|carbohydrateConfigurationalPrefix|carbohydrateStem|chain|charge|cisOrTrans|chalcogenAcidStem|diValentGroup|EorZ|functionalClass|group|inline|monoValentGroup|monoValentStandaloneGroup|nonCarboxylicAcid|ring|simpleGroup|substituent|root|VonBaeyer) #IMPLIED
                     subType (alkaneStem|anhydride|arylGroup|arylSubstituent|biochemical|carbohydrate|cycleformer|cyclicUnsaturableHydrocarbon|dedicatedFunctionalReplacementPrefix|elementaryAtom|endInAn|endInIc|endInIne|epoxyLike|functionalClassGroup|groupStem|halideOrPseudoHalide|hantzschWidman|heteroStem|fusionRing|simpleGroup|multiRadicalSubstituent|noAcyl|none|oxidoLike|perhalogeno|phospho|simpleSubstituent|substituent|terminal|ylForAcyl|ylForNothing|ylForYl) #IMPLIED
                     symbol CDATA #REQUIRED