Commits

lh359 committed 5a0c792 Merge

Changed tokenisers to take a String and return List<String>, and taggers to take List<String>, instead of POSContainer

Comments (0)

Files changed (17)

src/main/antlr3/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunker.g

 molecule
 	:  moleculeamount-> ^(MOLECULE  moleculeamount );
 
-moleculeamount : moleculeamount3|moleculeamount1 | moleculeamount2 ;
+moleculeamount : (moleculeamount3| moleculeamount1 | moleculeamount2) asAstate? ;
 
 moleculeamount3
 	:(quantity|mixture) inof (dtTHE | dt)? mixtureRatio mixture? oscarCompound ;
 afterCompoundCitationOrQuantity: (citation|quantity|comma (quantity1Node|citationStructure)|mixture)*;
 
 unnamedmolecule
-	: (unnamedmoleculeamount|referenceToCompound) -> ^(UNNAMEDMOLECULE unnamedmoleculeamount? referenceToCompound?);
+	: (unnamedmoleculeamount|referenceToCompound) asAstate? -> ^(UNNAMEDMOLECULE unnamedmoleculeamount? referenceToCompound? asAstate?);
 
 unnamedmoleculeamount
 	:(unnamedmoleculeamount1|unnamedmoleculeamount2|unnamedmoleculeamount3|unnamedmoleculeamount4|unnamedmoleculeamount5|unnamedmoleculeamount6) ;
 unnamedmoleculeamount6
 	:(quantity|mixture) nnchementity;
 
+asAstate
+	: inas dt (jj|jjchem)* nnstate quantity*;
+
 referenceToCompound
 	: (nnchementity | {numberLooksLikeAReferenceToACompound(input)}?) numericOrIdentifierCompoundReference;
 

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/ChemicalTaggerTokeniser.java

 
 package uk.ac.cam.ch.wwmm.chemicaltagger;
 
+import java.util.List;
+
 /**************************************
  * An interface for tokenisers.
  * @author lh359
      * @param inputSentence (String)
      * @return wordTokenList (List<String>)
      */
-	POSContainer tokenise(POSContainer posContainer) ;
+	List<String> tokenise(String inputSentence) ;
 	
 }

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/ChemistryPOSTagger.java

 		
 		POSContainer posContainer = new POSContainer();
 		List<String> ignoredTags = new ArrayList<String>();
-		posContainer = normaliseAndTokeniseInput(inputSentence, posContainer, useSpectraTagger);		
+		List<String> wordTokenList = normaliseAndTokeniseInput(inputSentence, posContainer, useSpectraTagger);		
+		posContainer.setWordTokenList(wordTokenList);
+		
 		for (Tagger tagger : taggersOrderedInDescendingPriority){
-			tagger.runTagger(posContainer);
+			List<String> tagList = tagger.runTagger(wordTokenList,posContainer.getInputText());
+			posContainer.registerTagList(tagList);
+
 			if (tagger.getIgnoredTags() != null)
 		       	ignoredTags.addAll(tagger.getIgnoredTags());
 		}
 	 * @param useSpectraTagger (boolean)
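 	 * @return wordTokenList (List<String>) tokens produced from the normalised inputSentence;
 	 *         NOTE(review): the text is no longer re-read from posContainer after
 	 *         SpectraTagger runs -- confirm SpectraTagger does not alter the input text.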
 	 */
-	private POSContainer normaliseAndTokeniseInput(String inputSentence, POSContainer posContainer, boolean useSpectraTagger) {
+	private List<String> normaliseAndTokeniseInput(String inputSentence, POSContainer posContainer, boolean useSpectraTagger) {
 		inputSentence = Formatter.normaliseText(inputSentence);
 		posContainer.setInputText(inputSentence);
+
 		if (useSpectraTagger){
 		    posContainer = SpectraTagger.runTagger(posContainer);
 		}
-		posContainer = ctTokeniser.tokenise(posContainer);
-		return posContainer;
+		List<String> wordTokenList = ctTokeniser.tokenise(inputSentence);
+
+		return wordTokenList;
 	}
 		
 		

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/OpenNLPTagger.java

 	}
 
 	/*****************************************************
-	 * Runs the OpenNLP POS tagger against the text and stores the tags in
-	 * POSContainer.
-	 * @param posContainer (POSContainer)
-	 * @return posContainer (POSContainer) 
+	 * Runs the OpenNLP POS tagger against a list of tokens and returns a list of tags
+	 * @param tokenList (List<String>)
+	 * @param inputSentence (String) not used by this tagger
+	 * @return tagList (List<String>)
 	 *****************************************************/
-	public List<String> runTagger(POSContainer posContainer) {
-		List<String> tokenList = posContainer.getWordTokenList();
+	public List<String> runTagger(List<String> tokenList, String inputSentence) {
 		String[] tokens = tokenList.toArray(new String[tokenList.size()]);
 		String[] tags = posTagger.tag(tokens);
 		List<String> tagList = createPosTagListFromStringArray(tags);
-		posContainer.registerTagList(tagList);
 		return tagList;
 	}
 

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/OscarTagger.java

 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;
 
+import org.apache.commons.lang.StringUtils;
+
 import uk.ac.cam.ch.wwmm.oscar.Oscar;
 import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
 import uk.ac.cam.ch.wwmm.oscar.document.Token;
+import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
 
 /*****************************************************
  * Runs the OSCAR tagger .
 	}
 
 	/***********************************************
-	 * Runs OSCAR over a list of tokens.
-	 * 
-	 * @param posContainer  (POSContainer)
-	 * @return posContainer (POSContainer)
+	 * Runs OSCAR over a list of tokens and returns a list of tags
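+	 * @param tokenList (List<String>)
+	 * @param inputSentence (String)
+	 * @return tagList (List<String>) one tag per token: "nil", or "OSCAR-" followed by the entity type name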
 	 ***********************************************/
-	public List<String> runTagger(POSContainer posContainer) {
+	public List<String> runTagger(List<String> tokenList, String inputSentence) {
 
-		List<NamedEntity> neList = oscar.recogniseNamedEntities(posContainer.getTokenSequenceList());
+		List<TokenSequence> tokenSequenceList = convertToOscarTokenSequences(tokenList, StringUtils.join(tokenList.iterator()," "));		
+		
+		List<NamedEntity> neList = oscar.recogniseNamedEntities(tokenSequenceList);
         List<String> ignoreOscarList = Arrays.asList("cpr");
-		List<String> tokenList = posContainer.getWordTokenList();
-		List<String> oscarList = new ArrayList<String>();
+		List<String> tagList = new ArrayList<String>();
 		String tag = "nil";
 		for (int i = 0; i < tokenList.size(); i++) {
-			oscarList.add(tag);
+			tagList.add(tag);
 		}
 		for (NamedEntity ne : neList) {
 			if (!ignoreOscarList.contains(ne.getType().getName().toLowerCase())) {
                  
 				for (Token token : tokens) {
 					if (tokenList.get(token.getIndex()).contains(token.getSurface())) {
-						oscarList.set(token.getIndex(), "OSCAR-"+ne.getType().getName());
+						tagList.set(token.getIndex(), "OSCAR-"+ne.getType().getName());
 					}
 				}
 			}
 		}
-		posContainer.registerTagList(oscarList);
-		return oscarList;
+		return tagList;
 	}
 	public List<String> getIgnoredTags() {
 		return null;
 	}
+	
+	/*********************************************
+	 * Converts a list of words into a list of Oscar TokenSequences.
+	 * @param wordTokenList (List<String>)
+	 * @param inputText (String)
+	 * @return tokenSequenceList (List<TokenSequence>)
+	 ********************************************/
+	private  List<TokenSequence> convertToOscarTokenSequences(List<String> wordTokenList, String inputText) {
+		List<Token> oscarTokens = convertWordlistToOscarTokens(wordTokenList);
+		List<TokenSequence> tokenSequenceList = makeTokenSequences(inputText, oscarTokens);
+		return tokenSequenceList;
+	}
+
+
+	/*************************************************
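+	 * Converts a list of words into a list of Oscar Tokens.
+	 * Character offsets are assigned as if the words were joined by single spaces.
+	 * NOTE(review): endFlag starts as true and is never set to false, so the
+	 * token index reset on "." never executes.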
+	 * @param wordTokenList (List<String>)
+	 * @return oscarTokens (List<Token>)
+	 ***********************************************/
+	private List<Token> convertWordlistToOscarTokens(List<String> wordTokenList) {
+
+		int index = 0;
+		int sentenceIndex = 0;
+		List<Token> oscarTokens = new LinkedList<Token>();
+		boolean endFlag = true;
+
+		for (String word : wordTokenList)  {
+			int startIndex = sentenceIndex;
+			int endIndex = sentenceIndex+word.length();
+			Token oscarToken = new Token(word, startIndex, endIndex, null, null, null);
+			
+			
+			oscarToken.setIndex(index);
+			oscarTokens.add(oscarToken);
+			sentenceIndex = endIndex+1;
+			index++;
+			if (word.equals(".") & !endFlag) {
+				index = 0;
+				endFlag = true;
+			}
+		}
+		return oscarTokens;
+	}
+	
+    /***************************************************
+     * Creates a list of tokenSequences from the Oscar tokens. 
+     * @param surfaceText (String)
+     * @param oscarTokens (List<Token>)
+     * @return tokSequenceList (List<TokenSequence>)
+     *****************************************************/
+	private List<TokenSequence> makeTokenSequences(String surfaceText,	List<Token> oscarTokens) {
+
+		TokenSequence tokSeq = new TokenSequence(surfaceText, 0, null, oscarTokens);
+		List<TokenSequence> tokSequenceList = new ArrayList<TokenSequence>();
+		tokSequenceList.add(tokSeq);
+		tokSequenceList = postProcess(tokSequenceList);
+
+		return tokSequenceList;
+	}
+	
+	/***************************************************
+     * Post-processes the tokenSequenceList by setting the owning TokenSequence on each of its tokens.
+     * The TokenSequences are used by the Oscar Tokens for lookahead.
+     * @param  tokSequenceList    (List<TokenSequence>)
+     * @return newTokSequenceList (List<TokenSequence>)
+     *****************************************************/
+	private List<TokenSequence> postProcess(List<TokenSequence> tokSequenceList) {
+		List<TokenSequence> newTokSequenceList = new ArrayList<TokenSequence>();
+		for (TokenSequence tokenSequence : tokSequenceList) {
+			for (Token token : tokenSequence.getTokens()) {
+				token.setTokenSequence(tokenSequence);
+			}
+			TokenSequence newTokenSequence = new TokenSequence(
+					tokenSequence.getSurface(), tokenSequence.getOffset(),
+					tokenSequence.getDoc(), tokenSequence.getTokens());
+			newTokSequenceList.add(newTokenSequence);
+		}
+		return newTokSequenceList;
+	}
+
 }

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/OscarTokeniser.java

 
 	/*****************************************************
 	 * Tokenises an inputText using OSCAR tokeniser. 
-	 * Sets the tokens to POSContainer's wordTokenList and
-	 * tokenSequenceList.
-	 * @param posContainer (POSContainer)
-	 * @return posContainer (POSContainer)
+	 * Returns a wordTokenList .
+	 * @param inputSentence (String)
+	 * @return List<String> 
 	 *****************************************************/
-	public POSContainer tokenise(POSContainer posContainer) {
+	public List<String> tokenise(String inputSentence) {
 		List<String> wordTokenList = new ArrayList<String>();
-		String sentence = posContainer.getInputText();
 		// Oscar doesn't do normalisation just yet
 		// sentence = oscar.normalise(sentence);
 
-		List<TokenSequence>  tokSequenceList = oscar.tokenise(sentence);
+		List<TokenSequence>  tokSequenceList = oscar.tokenise(inputSentence);
 		for (TokenSequence tokenSequence : tokSequenceList) {
 			for (Token token : tokenSequence.getTokens()) {
 
 
 			}
 		}
-		posContainer.setTokenSequenceList(tokSequenceList);
-		posContainer.setWordTokenList(wordTokenList);
-		return posContainer;
+		return wordTokenList;
 	}
 
 }

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/POSContainer.java

 
 import org.apache.commons.lang.StringUtils;
 
-import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
-
 /********************************************
  * A container class that stores the grammatical structure of the text .
  * 
 	}
 
 	private Element spectrumElementList;
-	private List<TokenSequence> tokenSequenceList;
 
 	/******************************
 	 * Default constructor method.
 		return wordTokenList;
 	}
 
-	/**************************************
-	 * Setter method for TokenSequenceList.
-	 * 
-	 * @param tokenSequenceList (List<TokenSequence>)
-	 ***************************************/
-	public void setTokenSequenceList(List<TokenSequence> tokenSequenceList) {
-		this.tokenSequenceList = tokenSequenceList;
-
-	}
-
-	/**************************************
-	 * Getter method for TokenSequenceList.
-	 * 
-	 * @return tokenSequenceList (List<TokenSequence>)
-	 ***************************************/
-	public List<TokenSequence> getTokenSequenceList() {
-		return tokenSequenceList;
-	}
-
 	
 
 	/**************************************

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/PostProcessTags.java

 				newTag = "NN-CHEMENTITY";
 			}
 		}
+		List<String> colours = Arrays.asList("amber", "bronze", "cream", "fawn", "gold", "ivory", "lavender", "tan");
+		if (colours.contains(currentToken.toLowerCase())){
+			if (stringAfter(Arrays.asList("nn-state"), i, combinedTags)){
+				newTag = "JJ";
+			}
+		}
+		
 		return newTag;
 	}
 

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/RegexTagger.java

 	}
 
 	/*********************************************************
-	 * Runs the regular expression tagger against the tokens.
-	 * @param posContainer (POSContainer)
-	 * @return posContainer (POSContainer)
+	 * Runs the regular expression tagger against a list of tokens and returns a list of tags
+	 * @param tokenList (List<String>)
+	 * @param inputSentence (String)
+	 * @return tagList (List<String>)
 	/*********************************************************/
-	public List<String> runTagger(POSContainer posContainer) {
+	public List<String> runTagger(List<String> tokenList, String inputSentence) {
 
-		List<String> tokenList = posContainer.getWordTokenList();
-		List<String> regexTagList = new ArrayList<String>();
+		List<String> tagList = new ArrayList<String>();
 		for (String token : tokenList) {
 			try {
 				Matcher m = Pattern.compile("dummy").matcher(token);
 						break;
 					}
 				}
-				regexTagList.add(tag);
+				tagList.add(tag);
 
 			} catch (Exception e) {
 				LOG.debug("Null pointer right there" + tokenList);
 
 			}
 		}
-		posContainer.registerTagList(regexTagList);
-		return regexTagList;
+		return tagList;
 	}
 
 	public List<String> getIgnoredTags() {

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/Tagger.java

 	/*****************************************************
 	 * Runs the tagger against a list of tokens and returns
 	 * the resulting list of tags.
 	 * @param tokenList (List<String>)
+	 * @param inputSentence (String)
 	 * @return tagList (List<String>)
 	 *****************************************************/
-	public List<String> runTagger(POSContainer posContainer);
+	public List<String> runTagger(List<String> tokenList, String inputSentence);
 
 	public List<String> getIgnoredTags() ;
 }

src/main/java/uk/ac/cam/ch/wwmm/chemicaltagger/WhiteSpaceTokeniser.java

 
 package uk.ac.cam.ch.wwmm.chemicaltagger;
 
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.LinkedList;
 import java.util.List;
 
-import uk.ac.cam.ch.wwmm.oscar.document.Token;
-import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
-
 /****************************************
  * A whitespace tokeniser to be used as a 
  * substitute to the OSCAR tokeniser.
 	public WhiteSpaceTokeniser(){
 		
 	}
-	
-	/********************************************
-	 * Converts a string into a list of tokens. 
-	 * @param inputText (String)
-	 * @return List<String>
-	 *****************************************/
-	private List<String> tokenise(String inputText){
-		return Arrays.asList(inputText.split("\\s+"));
-	}
-	
 
 	/********************************************
 	 * Tokenises a String on white space.
-	 * Requires a POSContainer.
-	 * @param  posContainer (POSContainer)
-	 * @return posContainer (POSContainer)
+	 * @param  inputSentence (String)
+	 * @return List<String>
 	 *****************************************/
-	public POSContainer tokenise(POSContainer posContainer){
-		List<String> wordTokenList = tokenise(posContainer.getInputText());
-		List<TokenSequence> tokenSequenceList = convertToOscarTokenSequences(wordTokenList, posContainer.getInputText());		
-		posContainer.setWordTokenList(wordTokenList);
-		posContainer.setTokenSequenceList(tokenSequenceList);
-		return posContainer;
+	public List<String> tokenise(String inputSentence){
+		return Arrays.asList(inputSentence.split("\\s+"));
 	}
 
-	/*********************************************
-	 * Converts a list of words into a list of Oscar TokenSequences.
-	 * @param wordTokenList (List<String>)
-	 * @param inputText (String)
-	 * @return tokenSequenceList (List<TokenSequence>)
-	 ********************************************/
-	private  List<TokenSequence> convertToOscarTokenSequences(List<String> wordTokenList, String inputText) {
-		List<Token> oscarTokens = convertWordlistToOscarTokens(wordTokenList);
-		List<TokenSequence> tokenSequenceList = makeTokenSequences(inputText, oscarTokens);
-		return tokenSequenceList;
-	}
-
-
-	/*************************************************
-	 * Converts a list of words into a list of Oscar Tokens.
-	 * @param wordTokenList (List<String>)
-	 * @return oscarTokens (List<Token>)
-	 ***********************************************/
-	private List<Token> convertWordlistToOscarTokens(List<String> wordTokenList) {
-
-		int index = 0;
-		int sentenceIndex = 0;
-		List<Token> oscarTokens = new LinkedList<Token>();
-		boolean endFlag = true;
-
-		for (String word : wordTokenList)  {
-			int startIndex = sentenceIndex;
-			int endIndex = sentenceIndex+word.length();
-			Token oscarToken = new Token(word, startIndex, endIndex, null, null, null);
-			
-			
-			oscarToken.setIndex(index);
-			oscarTokens.add(oscarToken);
-			sentenceIndex = endIndex+1;
-			index++;
-			if (word.equals(".") & !endFlag) {
-				index = 0;
-				endFlag = true;
-			}
-		}
-		return oscarTokens;
-	}
-	
-    /***************************************************
-     * Creates a list of tokenSequences from the Oscar tokens. 
-     * @param surfaceText (String)
-     * @param oscarTokens (List<IToken>)
-     * @return tokSequenceList (List<TokenSequence>)
-     *****************************************************/
-	private List<TokenSequence> makeTokenSequences(String surfaceText,	List<Token> oscarTokens) {
-
-		TokenSequence tokSeq = new TokenSequence(surfaceText, 0, null, oscarTokens);
-		List<TokenSequence> tokSequenceList = new ArrayList<TokenSequence>();
-		tokSequenceList.add(tokSeq);
-		tokSequenceList = postProcess(tokSequenceList);
-
-		return tokSequenceList;
-	}
-	
-	/***************************************************
-     * PostProcessing the tokenSequenceList by adding tokenSequences to each token within the tokenList.
-     * The TokenSequences are used by the Oscar Tokens for lookahead.
-     * @param  tokSequenceList    (List<TokenSequence>)
-     * @return newTokSequenceList (List<TokenSequence>)
-     *****************************************************/
-	private List<TokenSequence> postProcess(List<TokenSequence> tokSequenceList) {
-		List<TokenSequence> newTokSequenceList = new ArrayList<TokenSequence>();
-		for (TokenSequence tokenSequence : tokSequenceList) {
-			for (Token token : tokenSequence.getTokens()) {
-				token.setTokenSequence(tokenSequence);
-			}
-			TokenSequence newTokenSequence = new TokenSequence(
-					tokenSequence.getSurface(), tokenSequence.getOffset(),
-					tokenSequence.getDoc(), tokenSequence.getTokens());
-			newTokSequenceList.add(newTokenSequence);
-		}
-		return newTokSequenceList;
-	}
 }

src/main/java/uk/ac/cam/ch/wwmm/pregenerated/ChemicalChunkerLexer.java

-// $ANTLR 3.2 Sep 23, 2009 14:05:07 uk\\ac\\cam\\ch\\wwmm\\pregenerated\\ChemicalChunker.g 2011-07-11 16:57:19
+// $ANTLR 3.2 Sep 23, 2009 14:05:07 uk\\ac\\cam\\ch\\wwmm\\pregenerated\\ChemicalChunker.g 2011-07-14 18:42:37
 package uk.ac.cam.ch.wwmm.pregenerated;
 
 import org.antlr.runtime.*;