Hadoop：WordCount分析

相关代码：
 package com.hadoop;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.hadoop.fs.Path;

 import org.apache.hadoop.io.IntWritable;

 import org.apache.hadoop.io.Text;

 import org.apache.hadoop.mapreduce.Job;

 import org.apache.hadoop.mapreduce.Mapper;

 import org.apache.hadoop.mapreduce.Reducer;

 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

 import java.io.IOException;

 import java.util.StringTokenizer;

 public class WordCount {

     /**

      * Mapper接口是个泛型类型，它有4个形式参数类型，分别指定map函数的输入键、输入值、输出键和输出值的类型。

      * WordCount为例：输入键是一个长整数偏移量，输入的值是一行文本，输出的键是单词，输出的值是单词个数（整型）

      * Hadoop规定了自己的一套用于网络序列化的基本类型，而不直接使用Java内嵌的类型。这些类型在org.apache.hadoop.io包中。

      *      LongWritable类型相当于Java的Long类型

      *      Text类型相当于Java的String类型

      *      IntWritable类型相当于Java的Integer类型

      */

     public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

         private final static IntWritable one = new IntWritable(1);

         private Text word = new Text();

         /**

          *

          * @param key

          * @param value

          * @param context

          * @throws IOException

          * @throws InterruptedException

          * map( )方法的输入是一个键和一个值。首先使用StringTokenizer类将输入的Text值转换成String类型，然后使用nextToken( )方法将单词提取出来。

          * map( )方法还提供Context实例用于输出内容的写入。将单词数据按照Text类型进行读写，因为单词作为键。将单词数据数封装为IntWritable类型。

          */

         public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

             StringTokenizer itr = new StringTokenizer(value.toString());  // 有三个重载方法，这里以空白字符（“ ”，“\t”，“\n”）为分隔符分割字符串

             while (itr.hasMoreTokens()) {  // 判断是否还有分隔符

                 // set方法将String转换成Text

                 // nextToken返回当前位置到下一个分隔符位置的字符串

                 word.set(itr.nextToken());

                 context.write(word, one);   // 使用Context实例用于输出内容的写入

             }

         }

     }

     /**

      * reduce函数也有四个形式参数类型用于指定输入和输出类型。reduce函数的输入类型必须匹配map函数的输出类型：即Text类型和IntWritable类型。

      */

     public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

         private IntWritable result = new IntWritable();

         public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

             int sum = 0;

             for (IntWritable val : values) { // 遍历相同的key（单词）对应的values，并进行相加

                 sum += val.get();

             }

             result.set(sum);

             context.write(key, result); // 将统计的数目赋给每一个不同的单词

         }

     }

     public static void main(String[] args) throws Exception {

         /**

          * Configuration类是作业的配置信息类，任何作用的配置信息必须通过Configuration传递，

          * 因为通过Configuration可以实现在多个mapper和多个reducer任务之间共享信息。

          */

         Configuration conf = new Configuration();

         Job job = Job.getInstance(conf, "word count"); //Job对象制定作业执行规范，用它来控制整个作业的运行。

         /**

          * 在Hadoop集群上运行这个作业时，要把代码打包成一个JAR包，发布在集群上。

          * 不必明确指定JAR文件的名称，在Job对象的setJarByClass( )方法中传递一个类即可，Hadoop利用这个类查找包含它的JAR文件。

          */

         job.setJarByClass(WordCount.class);

         /**

          * setMapperClass( ) 和setReducerClass( )方法指定要用的map类型和reduce类型

          */

         job.setMapperClass(TokenizerMapper.class);

         job.setReducerClass(IntSumReducer.class);

         job.setCombinerClass(IntSumReducer.class);

         /**

          * setOutputKeyClass( ) 和setOutputValueClass( )方法控制reduce函数的输出类型，必须要和Reduce类产生的相匹配。

          * 输入的类型没有设置，因为使用了默认的TextInputFormat（文本输入格式）

          */

         job.setOutputKeyClass(Text.class);

         job.setOutputValueClass(IntWritable.class);

         /**

          * FileInputFormat类的静态方法addInputPath( )来指定输入数据的路径

          * 该路径可以是单个的文件、一个目录或符合特定文件模式的一系列文件。

          * '可以多次调用addInputPath( )来实现多路径的输入。

          */

         FileInputFormat.addInputPath(job, new Path(args[0]));

         /**

          * FileOutputFormat类中的静态方法setOutputPath( )来指定输出路径（只能有一个输出路径），即reduce函数输出文件的写入目录。

          * 在运行作业前该目录不能存在，否则Hadoop会报错并拒绝运行作业。

          * 目的：防止数据丢失，假如一个作业运行了很久才得出结果，现在被另一个作业不小心覆盖会令人崩溃。

          */

         FileOutputFormat.setOutputPath(job, new Path(args[1]));

         /**

          * waitForCompletion( )方法提交作业并等待执行完成。该方法的唯一参数是一个标识，指示是否已生成详细输出。

          */

         System.exit(job.waitForCompletion(true) ? 0 : 1);

     }

 }
运行结果：
巴特西

Hadoop：WordCount分析

最新文章

热门文章