Commits

Jason Baldridge committed f723530 Merge

Merged.

  • Parent commits 6393b48, 73aa7ab

Comments (0)

Files changed (3)

File bin/email_when_run.sh

+#!/bin/bash -u
+
+# Subject: [dicta] The script to email yourself when your TACC job is ready
+# Date:    Thu, 1 Sep 2011 14:29:54 -0500
+# From:    Hohyon Ryu
+#
+#Here's the script to email yourself when your TACC job is ready.
+#
+# Example use:
+#
+# cat qsub.sh
+#qstat
+#echo "Deleting .hadoop2 logs"
+#rm -rf ~/.hadoop2/logs
+#rm Hadoop*
+#echo "Submiting Job.."
+#qsub job.hadoop.new
+#email_when_run.sh &
+
+# Take the address from the EMAIL environment variable if set, otherwise
+# from the first argument (or replace with a hard-coded address).
+# ${EMAIL:-} keeps 'bash -u' from aborting when EMAIL is unset.
+if [ -z "${EMAIL:-}" ]
+then
+    EMAIL=$1
+fi
+
+# Column 5 of qstat's output is the job state ("qw" = queued and waiting)
+AVAIL=$(qstat | grep "$USER" | awk '{print $5}')
+
+echo "Waiting until hadoop runs...";
+while [ $AVAIL == "qw" ]; do
+       AVAIL=`qstat | grep $USER | awk '{print $5}'`;
+       sleep 10;
+done
+
+echo "It is running now!"
+echo "$AVAIL: `date` ";
+
+# Send the notification, with the job's output file as the message body
+mail -s "TACC Hadoop job is Running!" "$EMAIL" < Hadoop-test.out
+

File src/main/java/fogbow/example/CloudNineWordCount.java

+/*
+ * Cloud9: A MapReduce Library for Hadoop
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You may
+ * obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0 
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+package fogbow.example;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Logger;
+
+/**
+ * <p>
+ * Simple word count demo. This Hadoop Tool counts words in a flat text file
+ * and takes the following command-line arguments:
+ * </p>
+ * 
+ * <ul>
+ * <li>[input-path] input path</li>
+ * <li>[output-path] output path</li>
+ * <li>[num-reducers] number of reducers</li>
+ * </ul>
+ * 
+ * Taken and adapted from edu.umd.cloud9.demo.DemoWordCount.
+ * 
+ * @author Jimmy Lin
+ */
+public class CloudNineWordCount extends Configured implements Tool {
+
+  private static final Logger sLogger = Logger.getLogger(CloudNineWordCount.class);
+
+  // mapper: emits (token, 1) for every word occurrence
+  private static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+
+    // reuse objects to save overhead of object creation
+    private final static IntWritable one = new IntWritable(1);
+    private Text word = new Text();
+
+    @Override
+    public void map(LongWritable key, Text value, Context context)
+        throws IOException, InterruptedException {
+      String line = value.toString();
+      StringTokenizer itr = new StringTokenizer(line);
+      while (itr.hasMoreTokens()) {
+        word.set(itr.nextToken());
+        context.write(word, one);
+      }
+    }
+  }
+
+  // reducer: sums up all the counts
+  private static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+
+    // reuse objects
+    private final static IntWritable sumValue = new IntWritable();
+
+    @Override
+    public void reduce(Text key, Iterable<IntWritable> values, Context context)
+        throws IOException, InterruptedException {
+
+      // sum up values
+      Iterator<IntWritable> iter = values.iterator();
+      int sum = 0;
+      while (iter.hasNext()) {
+        sum += iter.next().get();
+      }
+      sumValue.set(sum);
+      context.write(key, sumValue);
+    }
+  }
+
+  /**
+   * Creates an instance of this tool.
+   */
+  public CloudNineWordCount() { }
+
+  private static int printUsage() {
+    System.out.println("usage: [input-path] [output-path] [num-reducers]");
+    ToolRunner.printGenericCommandUsage(System.out);
+    return -1;
+  }
+
+  /**
+   * Runs this tool.
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length != 3) {
+      return printUsage();
+    }
+
+    String inputPath = args[0];
+    String outputPath = args[1];
+    int reduceTasks = Integer.parseInt(args[2]);
+
+    sLogger.info("Tool: CloudNineWordCount");
+    sLogger.info(" - input path: " + inputPath);
+    sLogger.info(" - output path: " + outputPath);
+    sLogger.info(" - number of reducers: " + reduceTasks);
+
+    Configuration conf = new Configuration();
+    Job job = new Job(conf, "CloudNineWordCount");
+    job.setJarByClass(CloudNineWordCount.class);
+
+    job.setNumReduceTasks(reduceTasks);
+
+    FileInputFormat.setInputPaths(job, new Path(inputPath));
+    FileOutputFormat.setOutputPath(job, new Path(outputPath));
+
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(IntWritable.class);
+
+    job.setMapperClass(MyMapper.class);
+    job.setCombinerClass(MyReducer.class);
+    job.setReducerClass(MyReducer.class);
+
+    // Delete the output directory if it exists already
+    Path outputDir = new Path(outputPath);
+    FileSystem.get(conf).delete(outputDir, true);
+
+    long startTime = System.currentTimeMillis();
+    job.waitForCompletion(true);
+    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0
+		 + " seconds");
+
+    return 0;
+  }
+
+  /**
+   * Dispatches command-line arguments to the tool via the
+   * <code>ToolRunner</code>.
+   */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new CloudNineWordCount(), args);
+    System.exit(res);
+  }
+
+}
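
Side note: the CleanStringTokenizer added below in this same commit could replace the bare StringTokenizer in MyMapper, stripping punctuation and lowercasing tokens before counting. A minimal sketch of such a variant (hypothetical, not part of the committed file; Scala objects emit static forwarders, so the call compiles from Java as written):

    // Hypothetical alternative mapper using the CleanStringTokenizer
    // from src/main/scala/fogbow/util/StringUtil.scala in this commit.
    private static class CleanMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
      private final static IntWritable one = new IntWritable(1);
      private final Text word = new Text();

      @Override
      public void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {
        // Lowercases and replaces non-alphanumeric characters with spaces
        StringTokenizer itr = fogbow.util.CleanStringTokenizer.apply(value.toString());
        while (itr.hasMoreTokens()) {
          word.set(itr.nextToken());
          context.write(word, one);
        }
      }
    }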

File src/main/scala/fogbow/util/StringUtil.scala

+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package fogbow.util
+
+// A tokenizer that replaces all non-word characters with whitespace
+// and then returns a StringTokenizer.
+object CleanStringTokenizer {
+  import java.util.StringTokenizer
+
+  def apply(raw: String, doLowerCase: Boolean = true): StringTokenizer = {
+    val cleaned = raw.replaceAll("[^\\p{L}\\p{N}]", " ")
+    if (doLowerCase)
+      new StringTokenizer(cleaned.toLowerCase)
+    else
+      new StringTokenizer(cleaned)
+  }
+
+  // Explicit one-argument overload: Java callers can't see Scala default
+  // arguments, so provide it directly.
+  def apply(raw: String): StringTokenizer = apply(raw, true)
+
+}
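
For reference, a minimal sketch of a Java caller exercising that explicit one-argument overload (the TokenizerDemo class name is hypothetical, not part of this commit):

    import java.util.StringTokenizer;

    // Hypothetical demo of calling the Scala object from Java.
    public class TokenizerDemo {
      public static void main(String[] args) {
        // The default path lowercases and strips non-alphanumeric characters.
        StringTokenizer toks = fogbow.util.CleanStringTokenizer.apply("Hello, World! 42");
        while (toks.hasMoreTokens()) {
          System.out.println(toks.nextToken());
        }
      }
    }

Given that input, this prints hello, world, and 42, one token per line.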