package demos;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/**
 * MapReduce job that builds an inverted index: for every whitespace-separated,
 * lower-cased token found in the input it emits the names of the files that
 * contain that token.
 */
public class LineIndexer {

    /**
     * Emits one {@code (token, fileName)} pair for every token of every input line.
     */
    public static class LineIndexMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        // Reused across calls to avoid allocating per record. Kept per instance
        // (not static) so separate mapper instances never share mutable state.
        private final Text word = new Text();
        private final Text location = new Text();

        /**
         * Tokenizes one line and emits each lower-cased token with the name of
         * the file the current split belongs to.
         *
         * @param key      key supplied with the line; not used
         * @param val      the line of text to tokenize
         * @param output   receives one {@code (token, fileName)} pair per token
         * @param reporter used to obtain the current input split
         * @throws IOException if the collector fails to accept a pair
         */
        @Override
        public void map(LongWritable key, Text val, OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
            // Get the filename where this line came from.
            // NOTE: the cast assumes a file-based input format (the job does not
            // override the default); any other split type would fail here.
            FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
            String fileName = fileSplit.getPath().getName();
            location.set(fileName);

            // Locale.ROOT keeps the index keys independent of the machine's locale.
            String line = val.toString().toLowerCase(Locale.ROOT);
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, location);
            }
        }
    }

    /**
     * Collapses all file names seen for a token into a single
     * {@code ", "}-separated list without duplicates.
     */
    public static class LineIndexReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {

        // Reused (and cleared) for every key; per instance rather than static so
        // that separate reducer instances never share it.
        private final Set<String> files = new HashSet<String>();

        /**
         * Emits {@code key} together with the distinct file names from
         * {@code values}, joined by {@code ", "} in first-seen order.
         *
         * @param key      the token being indexed
         * @param values   file names emitted by the mappers for this token
         * @param output   receives the single {@code (token, fileList)} pair
         * @param reporter not used
         * @throws IOException if the collector fails to accept the pair
         */
        @Override
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
            files.clear();
            boolean first = true;
            StringBuilder toReturn = new StringBuilder();
            while (values.hasNext()) {
                // Store a String copy rather than the Text object itself, since
                // the framework may reuse the same Text instance between values.
                String filename = values.next().toString();
                // Set.add returns false for a name that was already recorded.
                if (files.add(filename)) {
                    if (!first) {
                        toReturn.append(", ");
                    }
                    first = false;
                    toReturn.append(filename);
                }
            }
            output.collect(key, new Text(toReturn.toString()));
        }
    }

    /**
     * Configures the indexing job and runs it via {@link JobClient#runJob(JobConf)}.
     *
     * <p>Any exception raised while running the job is printed to standard error
     * and not propagated, so callers cannot detect a failed job from this method.
     *
     * @param input  path of the input file or directory
     * @param output path of the output directory
     */
    public static void run(String input, String output) {
        JobConf conf = new JobConf(LineIndexer.class);
        conf.setJobName("LineIndexer");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        conf.setMapperClass(LineIndexMapper.class);
        conf.setReducerClass(LineIndexReducer.class);

        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            // Best-effort reporting kept from the original implementation.
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point. Expects exactly two arguments, the input path
     * and the output path; otherwise prints a usage message to standard error.
     *
     * @param args command-line arguments: input path, output path
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Usage: LineIndexer <input path> <output path>");
        } else {
            run(args[0], args[1]);
        }
    }
}