Commits

Sara Magliacane  committed 260a028

adding 100 ngrams splitter

  • Participants
  • Parent commits 234bb97

Comments (0)

Files changed (5)

File src/main/java/nl/vu/recoprov/LuceneIndexer.java

 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 		
 		FSDirectory store = SimpleFSDirectory.open(indexDir);
 		IndexWriter writer = createIndexWriter(store);
+		
 
 		for (String name : input.keySet()) {
 

File src/main/java/nl/vu/recoprov/SentenceSplitter.java

+package nl.vu.recoprov;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+
+public class SentenceSplitter {
+	private SentenceDetectorME sentenceDetector = null;
+
+	public static final String SENTENCE_DIRECTORY = "sentences/";
+	public static final int DEFAULT_K = 100;
+	
+	/**
+	 * Creates a splitter and immediately runs {@link #initialize()}.
+	 */
+	public SentenceSplitter() {
+		initialize();
+	}
+
+	/**
+	 * Creates the {@link #SENTENCE_DIRECTORY} directory when it does not exist
+	 * yet, then builds the sentence detector from the model file
+	 * {@code en-sent.bin} (a relative path).
+	 * <p>
+	 * If the model cannot be read, the stack trace is printed and the detector
+	 * field is left unchanged.
+	 */
+	public void initialize() {
+		File outputDir = new File(SENTENCE_DIRECTORY);
+		if (!outputDir.exists()) {
+			outputDir.mkdir();
+		}
+
+		SentenceModel loadedModel;
+		InputStream modelStream = null;
+		try {
+			modelStream = new FileInputStream("en-sent.bin");
+			loadedModel = new SentenceModel(modelStream);
+		} catch (IOException e) {
+			e.printStackTrace();
+			return;
+		} finally {
+			if (modelStream != null) {
+				try {
+					modelStream.close();
+				} catch (IOException ignored) {
+					// a failure while closing the model stream is deliberately ignored
+				}
+			}
+		}
+
+		sentenceDetector = new SentenceDetectorME(loadedModel);
+	}
+	
+	
+	/**
+	 * Splits the given text into sentences using the sentence detector.
+	 *
+	 * @param text the text to split
+	 * @return the sentences found by the detector
+	 * @throws IllegalStateException if no detector is available because the
+	 *         model could not be loaded by {@link #initialize()}
+	 */
+	public String[] splitIntoStrings(String text){
+		// initialize() returns without a detector when the model file is
+		// unreadable; fail with a clear message instead of a bare NPE
+		if (sentenceDetector == null)
+			throw new IllegalStateException(
+					"Sentence detector not initialized: model en-sent.bin could not be loaded");
+		return sentenceDetector.sentDetect(text);
+	}
+	
+	/**
+	 * Writes every sentence of {@code content} to its own file. The file for
+	 * the sentence at position {@code i} (starting at 0) is
+	 * {@code SENTENCE_DIRECTORY + filename + "_" + i}.
+	 * <p>
+	 * An I/O failure while writing one sentence prints the stack trace and
+	 * processing continues with the next sentence.
+	 *
+	 * @param filename base name of the generated files
+	 * @param content  text to split into sentences
+	 */
+	public void createFilePerSentence(String filename, String content){
+		String[] sentences = splitIntoStrings(content);
+		for (int index = 0; index < sentences.length; index++) {
+			FileWriter out = null;
+			try {
+				String path = SENTENCE_DIRECTORY + filename + "_" + index;
+				out = new FileWriter(new File(path));
+				out.append(sentences[index]);
+			} catch (IOException e) {
+				e.printStackTrace();
+			} finally {
+				if (out != null) {
+					try {
+						out.close();
+					} catch (IOException ignored) {
+						// a failure while closing is deliberately ignored
+					}
+				}
+			}
+		}
+	}
+
+	// may be better to use standard Lucene Shingles
+	public void createFileForKNgrams(String filename, String content){
+		int counter = 0;
+		int k = 0;
+		StringBuffer buffer = new StringBuffer();
+		FileWriter writer = null;
+		
+		for (String s : content.split(" ")){
+			
+			if (k < DEFAULT_K){
+				buffer.append(s + " ");
+				k++;
+			}
+			
+			else{			
+				try {
+					writer = new FileWriter(new File(SENTENCE_DIRECTORY + filename+"_"+counter));
+					writer.append(buffer);
+				} catch (IOException e) {
+					e.printStackTrace();
+				} finally{
+					if (writer != null)
+						try{
+							writer.close();
+						} catch (IOException e){
+						}
+				}
+				
+				buffer = new StringBuffer();
+				k = 0;
+				counter++;
+
+			}
+		}
+	}
+	
+	public static void main (String[] args){
+		SentenceSplitter s = new SentenceSplitter();
+		
+		String content = "prova. che voglia.";
+		
+		for (String string: s.splitIntoStrings(content))
+			System.out.println(string);
+
+		for (String string: content.split(" "))
+			System.out.println(string);
+	}
+
+}

File src/main/java/nl/vu/recoprov/TikaReader.java

 
 public class TikaReader {
 
-	String currentDir;
-	Tika tika = new Tika();
-	AutoDetectParser parser = new AutoDetectParser();
-	ParseContext context = new ParseContext();
-	String contentdirname = ConfigurationDefaults.CONTENTDIR;
+	private String currentDir;
+	private Tika tika = new Tika();
+	private AutoDetectParser parser = new AutoDetectParser();
+	private ParseContext context = new ParseContext();
+	private String contentdirname = ConfigurationDefaults.CONTENTDIR;
+	private SentenceSplitter sentenceSplitter = new SentenceSplitter();
 
 	public TikaReader(String dir) {
 		currentDir = dir;
 				out.write(textHandler.toString());
 				out.close();
 				
-				// System.out.println(f.getAbsolutePath());
-				// System.out.println(content);
+				// split sentences		
+				sentenceSplitter.createFileForKNgrams(filename, textHandler.toString());
 
 				tin.close();
 

File src/main/java/nl/vu/recoprov/experiments/Experiment3.java

 public class Experiment3 {
 
 	private static String dirfile = "pan12-detailed-comparison-training-corpus/";
+
 	private static String jsonfile = "pan.json";
 
 	private static DependencyGraph baselineGraph = null;
 	private static Logger logger;
 
+	public static final String RESULTS_DIRECTORY = "results/";
+	
 	public static void main(String[] args) throws Exception {
 
+		dirfile = "sentences/";
+		
 		logger = LoggerFactory
 				.getLogger("nl.vu.recoprov.experiments.Experiment3");
 
 			System.exit(0);
 		}
 		
+		File resultDir = new File (RESULTS_DIRECTORY);
+		if (!resultDir.exists())
+			resultDir.mkdir();
+		
 		// experimenting to get perfect recall
 		ConfigurationDefaults.LUCENE_MAX_NUMBER_DOCS = 6000;
 
 		DependencyGraph depGraphLuceneMore = createLuceneMoreGraph(pipeline);
 		
 		//add compression
-		DependencyGraph depGraphLuceneCompression = depGraphLucene
-				.copyGraph();
-		pipeline.compressionDistanceSignal(depGraphLuceneCompression);
+		//DependencyGraph depGraphLuceneCompression = createCompressionGraph(pipeline);
 	
 		double[] thresholds = { 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35,
 				0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
 				depGraph1 = depGraphLuceneThreshold.copyGraph();
 				pipeline.filterTextContainment(depGraph1);
 				pipeline.aggregateSignals(depGraph1);
-				writeResults(pipeline, depGraph, depGraph1, "LuceneFilter",
+				writeResults(pipeline, depGraph, depGraph1, "LuceneTextFilter",
 						writer);
 
 				depGraph1 = depGraphLuceneThreshold.copyGraph();
 				writeResults(pipeline, depGraph, depGraph1,
 						"LucenePAN2Filters", writer);
 				
-				depGraph1 = depGraphLuceneCompression.copyGraph();
-				pipeline.filterTopKEdges(depGraph1, CompressionDistanceSignal.COMPRESSION_DISTANCE);
-				pipeline.filterPlagiarismCorpus(depGraph1);
-				pipeline.filterLuceneThreshold(depGraphLuceneThreshold);
-				pipeline.aggregateSignals(depGraph1);
-				writeResults(pipeline, depGraph, depGraph1,
-						"LuceneCompressionPANFilter", writer);
+//				depGraph1 = depGraphLuceneCompression.copyGraph();
+//				pipeline.filterTopKEdges(depGraph1, CompressionDistanceSignal.COMPRESSION_DISTANCE);
+//				pipeline.filterPlagiarismCorpus(depGraph1);
+//				pipeline.filterLuceneThreshold(depGraphLuceneThreshold);
+//				pipeline.aggregateSignals(depGraph1);
+//				writeResults(pipeline, depGraph, depGraph1,
+//						"LuceneCompressionPANFilter", writer);
 				
 				
 				
 				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
 				pipeline.filterTextContainment(depGraph1);
 				pipeline.aggregateSignals(depGraph1);
-				writeResults(pipeline, depGraph, depGraph1, "LuceneMoreFilter",
+				writeResults(pipeline, depGraph, depGraph1, "LuceneMoreTextFilter",
 						writer);
 
 				depGraph1 = depGraphLuceneMoreThreshold.copyGraph();
 
 	}
 
+
 	private static void writeResults(CompletePipeline pipeline,
 			DependencyGraph depGraph, DependencyGraph predicted,
 			String message, FileWriter writer) throws FileNotFoundException {
 
 		logger.info("{} graph created, threshold {}", message,
 				ConfigurationDefaults.LUCENE_THRESHOLD);
-		pipeline.translateToPROVDM(predicted, "graph" + message + "_"
+		pipeline.translateToPROVDM(predicted, RESULTS_DIRECTORY+ "graph" + message + "_"
 				+ ConfigurationDefaults.LUCENE_THRESHOLD + ".gv");
 
 	}
 		return depGraphLuceneMore;
 	}
 
+
+	
 	public static DependencyGraph createReferenceGraph(CompletePipeline pipeline)
 			throws Exception {
 
 		return true;
 
 	}
+	
+	private static DependencyGraph createCompressionGraph(
+			CompletePipeline pipeline, DependencyGraph depGraphLucene) {
+		
+		DependencyGraph depGraph1 = DependencyGraph.deserializeDependencyGraph("depGraphCompression.ser");
+
+		if (depGraph1 != null)
+			return depGraph1;
+		
+		DependencyGraph depGraphLuceneCompression = depGraphLucene.copyGraph();
+		pipeline.compressionDistanceSignal(depGraphLuceneCompression);
+
+		try {
+			depGraphLuceneCompression.serialize("depGraphCompression.ser");
+			logger.info("Compression graph created.");
+		} catch (Exception e) {
+			e.printStackTrace();
+			logger.error("Could not create compression graph.");
+		}
+
+		return depGraphLuceneCompression;
+	}
 
 }

File src/main/java/nl/vu/recoprov/signaldetectors/CompressionDistanceSignal.java

 package nl.vu.recoprov.signaldetectors;
 import java.io.*;
 import java.util.zip.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
  
 
 import nl.vu.recoprov.abstractclasses.SignalDetector;
 import nl.vu.recoprov.baseclasses.DependencyGraph;
 import nl.vu.recoprov.baseclasses.DependencyNode;
+import nl.vu.recoprov.utils.ConfigurationDefaults;
 
 public class CompressionDistanceSignal extends SignalDetector {
 
 	public final static String COMPRESSION_DISTANCE ="compression-distance";
 	
+	/** Logger for this detector, looked up by the fully qualified class name. */
+	private Logger logger = LoggerFactory
+			.getLogger("nl.vu.recoprov.signaldetectors.CompressionDistanceSignal");
+
+	/**
+	 * Creates the detector; the logger is set up by the field initializer.
+	 */
+	public CompressionDistanceSignal() {
+	}
+
+	
 	@Override
 	public DependencyGraph computeSignal(DependencyGraph input) {
 		for (String name1 : input.keySet()) {
 			FileInputStream file1 = new FileInputStream(d1.getContent());
 			FileInputStream file2 = new FileInputStream(d2.getContent());
 			SequenceInputStream seq = new SequenceInputStream(file1, file2);
-			return zipFile(seq, d1.getId()+ "-"+ d2.getId()+ ".gz");
+			long result = zipFile(seq, d1.getId()+ "-"+ d2.getId()+ ".gz");
+			logger.info("Compression distance computed: " + d1.getId() + "->" + d2.getId());
+			return result;
 
 		} catch (FileNotFoundException e) {
 			// TODO Auto-generated catch block
 			e.printStackTrace();
 		}
-		
+			
 		return 0;
 
 	}