Anonymous avatar Anonymous committed 28e5373

Added Obama filtering code and changed build.sbt to add Hadoop dependency

Comments (0)

Files changed (2)

   "org.apache.opennlp" % "opennlp-tools" % "1.5.1-incubating",
   "org.clapper" %% "argot" % "0.3.5",
   "com.weiglewilczek.slf4s" %% "slf4s" % "1.0.7",
-  "org.scalatest" %% "scalatest" % "1.6.1" % "test"
+  "org.scalatest" %% "scalatest" % "1.6.1" % "test",
+  "org.apache.hadoop" % "hadoop-core" % "0.20.2"
   )
 
 // append several options to the list of options passed to the Java compiler

src/main/java/filter/ExtractObamaTweets.java

+package main.java.filter;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/**
+ * <p>
+ * ExtractObamaTweets. This program extracts relevant tweets (input lines
+ * that contain the word "obama", matched case-insensitively), and
+ * takes the following command-line arguments:
+ * </p>
+ * 
+ * <ul>
+ * <li>[input-path] input path</li>
+ * <li>[output-path] output path (deleted first if it already exists)</li>
+ * <li>[num-reducers] number of reducers</li>
+ * </ul>
+ * 
+ * <p>
+ * Each matching input line is written to the output as the key of a
+ * (line, "") pair.
+ * </p>
+ * 
+ * Taken and adapted from edu.umd.cloud9.demo.DemoWordCount 
+ * 
+ * @author Shilpa Shukla
+ */
+public class ExtractObamaTweets extends Configured {
+
+	// Compiled once and shared by all map() calls; Pattern instances are
+	// immutable and safe to reuse, so there is no need to recompile per record.
+	private static final Pattern OBAMA = Pattern.compile("obama", Pattern.CASE_INSENSITIVE);
+
+	// Usage line printed when the wrong number of arguments is supplied.
+	private static final String USAGE =
+			"usage: ExtractObamaTweets [input-path] [output-path] [num-reducers]";
+
+	// mapper: passes through (key, line) for every input line mentioning "obama"
+	private static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
+
+		/**
+		 * Emits the input record unchanged when the line matches the filter.
+		 *
+		 * @param key     key supplied by the input format for this record
+		 * @param value   one line of input text
+		 * @param context used to emit the matching record
+		 */
+		@Override
+		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+			if (OBAMA.matcher(value.toString()).find()) {
+				context.write(key, value);
+			}
+		}
+	}
+
+	// reducer: writes every matching line as the output key, with an empty value
+	private static class MyReducer extends Reducer<LongWritable, Text, Text, Text> {
+
+		// Reused for every output record; the value column is intentionally empty.
+		private final Text empty = new Text("");
+
+		/**
+		 * Writes each line grouped under {@code key}.
+		 *
+		 * All values are emitted, not just the first one: lines coming from
+		 * different input files can share the same map key, and writing only
+		 * the first value would silently drop the others.
+		 *
+		 * @param key     map output key the lines were grouped under
+		 * @param values  matching lines that share {@code key}
+		 * @param context used to emit the output records
+		 */
+		@Override
+		public void reduce(LongWritable key, Iterable<Text> values, Context context)
+				throws IOException, InterruptedException {
+			for (Text tweet : values) {
+				context.write(tweet, empty);
+			}
+		}
+	}
+
+	/**
+	 * Configures and runs the filtering job, then exits with status 0 on
+	 * success and 1 on job failure. Prints a usage message and exits with
+	 * status -1 when the argument count is wrong.
+	 *
+	 * @param args [input-path] [output-path] [num-reducers]
+	 * @throws Exception if job setup or execution fails, or if
+	 *                   [num-reducers] is not an integer
+	 */
+	public static void main(String[] args) throws Exception {
+		if (args.length != 3) {
+			System.err.println(USAGE);
+			System.exit(-1);
+		}
+
+		String inputPath = args[0];
+		String outputPath = args[1];
+		int reduceTasks = Integer.parseInt(args[2]);
+
+		Configuration conf = new Configuration();
+		Job job = new Job(conf, "ExtractObamaTweets");
+		job.setJarByClass(ExtractObamaTweets.class);
+
+		job.setNumReduceTasks(reduceTasks);
+
+		Path outputDir = new Path(outputPath);
+		FileInputFormat.setInputPaths(job, new Path(inputPath));
+		FileOutputFormat.setOutputPath(job, outputDir);
+
+		// The mapper and the reducer emit different key types, so the
+		// intermediate and final output types are declared separately.
+		job.setMapOutputKeyClass(LongWritable.class);
+		job.setMapOutputValueClass(Text.class);
+		job.setOutputKeyClass(Text.class);
+		job.setOutputValueClass(Text.class);
+
+		job.setMapperClass(MyMapper.class);
+		job.setReducerClass(MyReducer.class);
+
+		// Delete the output directory if it exists already. Resolve the
+		// filesystem from the path itself so that an output URI on a
+		// non-default filesystem is handled correctly.
+		FileSystem fs = outputDir.getFileSystem(conf);
+		fs.delete(outputDir, true);
+
+		// Propagate job failure to the caller through the exit status.
+		boolean succeeded = job.waitForCompletion(true);
+		System.exit(succeeded ? 0 : 1);
+	}
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.