Commits

Jason Baldridge committed abe074f

Simple implementation of inverted index.

Comments (0)

Files changed (2)

project/plugins/project/build.properties

 #Project properties
-#Thu Jul 07 13:20:12 CDT 2011
+#Thu Aug 11 16:48:21 CDT 2011
 plugin.uptodate=true

src/main/java/fogbow/example/InvertedIndex.java

+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package fogbow.example;
+
+import java.io.IOException;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.GenericOptionsParser;
+import edu.umd.cloud9.io.pair.PairOfInts;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Collections;
+
+public class InvertedIndex {
+
+  public static class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, PairOfInts> {
+    
+    public void map(LongWritable key, Text value, Context context)
+    	throws IOException, InterruptedException {
+
+      HashMap<String, Integer> termFrequency = new HashMap<String, Integer>();
+      
+      StringTokenizer itr = new StringTokenizer(value.toString());
+      while (itr.hasMoreTokens()) {
+	String term = itr.nextToken();
+	if (termFrequency.containsKey(term))
+	  termFrequency.put(term, termFrequency.get(term).intValue() + 1);
+	else
+	  termFrequency.put(term, 1);
+      }
+
+      for (Iterator<String> termKeys = termFrequency.keySet().iterator(); termKeys.hasNext(); ) {
+	String term = termKeys.next();
+	PairOfInts posting = new PairOfInts((int)key.get(), termFrequency.get(term).intValue());
+        context.write(new Text(term), posting);
+      }
+    }
+  }
+  
+  public static class InvertedIndexReducer extends Reducer<Text,PairOfInts,Text,Text> {
+
+    public void reduce(Text key, Iterable<PairOfInts> values, Context context)
+    	throws IOException, InterruptedException {
+
+      ArrayList<PairOfInts> postings = new ArrayList<PairOfInts>();
+      int count = 0;
+      for (PairOfInts val : values) {
+	count += val.getRightElement();
+	postings.add(val.clone());
+      }
+      Collections.sort(postings);
+      
+      StringBuffer textRep = new StringBuffer().append(count);
+      for (PairOfInts val: postings)
+	textRep.append(":"+val.getLeftElement() + "," + val.getRightElement());
+
+      context.write(key, new Text(textRep.toString()));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    Job job = new Job(conf, "inverted index");
+    job.setJarByClass(InvertedIndex.class);
+    job.setMapperClass(InvertedIndexMapper.class);
+    //job.setCombinerClass(InvertedIndexReducer.class);
+    job.setReducerClass(InvertedIndexReducer.class);
+    job.setMapOutputValueClass(PairOfInts.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    FileOutputFormat.setOutputPath(job, new Path(args[1]));
+    System.exit(job.waitForCompletion(true) ? 0 : 1);
+  }
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.