老的API实现WordCount

使用 Hadoop 0.x 版本的旧 API（org.apache.hadoop.mapred 包）实现单词统计（WordCount），完整代码如下：
 package old;

 import java.io.IOException;

 import java.net.URI;

 import java.util.Iterator;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.FileSystem;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.LongWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapred.FileInputFormat;

 import org.apache.hadoop.mapred.FileOutputFormat;

 import org.apache.hadoop.mapred.JobClient;

 import org.apache.hadoop.mapred.JobConf;

 import org.apache.hadoop.mapred.MapReduceBase;

 import org.apache.hadoop.mapred.Mapper;

 import org.apache.hadoop.mapred.OutputCollector;

 import org.apache.hadoop.mapred.Reducer;

 import org.apache.hadoop.mapred.Reporter;

 /**

  * 老API实现单词统计

  *

  */

 /**

  * hadoop版本1.x的包一般是mapreduce

  *

  * hadoop版本0.x的包一般是mapred

  *

  */

 public class OldApp {

     static final String INPUT_PATH = "hdfs://chaoren:9000/hello";

     static final String OUT_PATH = "hdfs://chaoren:9000/out";

     public static void main(String[] args) throws Exception {

         Configuration conf = new Configuration();

         FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);

         Path outPath = new Path(OUT_PATH);

         if (fileSystem.exists(outPath)) {

             fileSystem.delete(outPath, true);

         }

         /**

          * 改动1：不再使用Job，而是使用JobConf

          *

          * 改动2：类的包名不再使用mapreduce,而是使用mapred

          *

          * 改动3：不再使用job.waitForCompletion(true)提交作业，而是使用JobClient.runJob(job);

          */

         JobConf job = new JobConf(conf, OldApp.class);

         // 1.1指定读取的文件位于哪里

         FileInputFormat.setInputPaths(job, INPUT_PATH);

         // 指定如何对输入的文件进行格式化，把输入文件每一行解析成键值对

         // job.setInputFormatClass(TextInputFormat.class);

         // 1.2指定自定义的map类

         job.setMapperClass(MyMapper.class);

         // map输出的<k,v>类型。如果<k3,v3>的类型与<k2,v2>类型一致，则可以省略

         // job.setOutputKeyClass(Text.class);

         // job.setOutputValueClass(LongWritable.class);

         // 1.3分区

         // job.setPartitionerClass(org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);

         // 有一个reduce任务运行

         // job.setNumReduceTasks(1);

         // 1.4排序、分组

         // 1.5归约

         // 2.2指定自定义reduce类

         job.setReducerClass(MyReducer.class);

         // 指定reduce的输出类型

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(LongWritable.class);

         // 2.3指定写出到哪里

         FileOutputFormat.setOutputPath(job, outPath);

         // 指定输出文件的格式化类

         // job.setOutputFormatClass(TextOutputFormat.class);

         // 把job提交给jobtracker运行

         JobClient.runJob(job);

     }

     /**

      * 新API：extends Mapper

      *

      * 老API：extends MapReduceBase implements Mapper

      */

     static class MyMapper extends MapReduceBase implements

             Mapper<LongWritable, Text, Text, LongWritable> {

         public void map(LongWritable k1, Text v1,

                 OutputCollector<Text, LongWritable> collector, Reporter reporter)

                 throws IOException {

             String[] split = v1.toString().split("\t");

             for (String word : split) {

                 collector.collect(new Text(word), new LongWritable(1));

             }

         }

     }

     static class MyReducer extends MapReduceBase implements

             Reducer<Text, LongWritable, Text, LongWritable> {

         public void reduce(Text k2, Iterator<LongWritable> v2s,

                 OutputCollector<Text, LongWritable> collector, Reporter reporter)

                 throws IOException {

             long times = 0L;

             while (v2s.hasNext()) {

                 long temp = v2s.next().get();

                 times += temp;

             }

             collector.collect(k2, new LongWritable(times));

         }

     }

 }
查看结果：
巴特西

老的API实现WordCount

最新文章

热门文章