Hadoop实战-MapReduce之WordCount(五)
环境介绍:
主服务器ip:192.168.80.128(master) NameNode SecondaryNameNode ResourceManager
从服务器ip:192.168.80.129(slave1) DataNode NodeManager
从服务器ip: 192.168.80.130(slave2) DataNode NodeManager
1.文件准备
1)在HDFS上创建文件夹
hadoop fs -mkdir -p /user/joe/wordcount/input
2)在本地创建文件夹
mkdir /home/chenyun/data/mapreduce
3)创建file01
cd /home/chenyun/data/mapreduce
touch file01
vi file01
往file01写入内容:
Hello World, Bye World!
4)创建file02
cd /home/chenyun/data/mapreduce
touch file02
vi file02
往file02写入内容:
Hello Hadoop, Goodbye to hadoop.
5)把本地文件file01、file02上传到hdfs的/user/joe/wordcount/input目录
hadoop fs -put /home/chenyun/data/mapreduce/file01 /user/joe/wordcount/input
hadoop fs -put /home/chenyun/data/mapreduce/file02 /user/joe/wordcount/input
2.编写mapreduce程序
1)在Eclipse编写Mapreduce程序(类需放在com.accp.mapreduce包下,即源文件首行为 package com.accp.mapreduce; 与后面运行命令中的类全名保持一致)
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

/**
 * Word-count MapReduce job with optional case folding and pattern skipping.
 *
 * <p>Configuration keys read by the mapper:
 * <ul>
 *   <li>{@code wordcount.case.sensitive} (default {@code true}) - when
 *       {@code false}, every input line is lower-cased before tokenizing.</li>
 *   <li>{@code wordcount.skip.patterns} (default {@code false}) - set by
 *       {@link #main(String[])} when a {@code -skip} file is given; enables
 *       loading of skip patterns from the job's cache files.</li>
 * </ul>
 */
public class WordCount {

    /**
     * Splits each input line into whitespace-separated tokens and emits
     * {@code (token, 1)} for every token.
     */
    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        /** Custom counters reported by this mapper. */
        static enum CountersEnum {
            INPUT_WORDS
        }

        private final static IntWritable one = new IntWritable(1);

        /** Reused output key, to avoid allocating a Text per token. */
        private final Text word = new Text();
        private boolean caseSensitive;

        /** Regular expressions whose matches are removed from every line. */
        private final Set<String> patternsToSkip = new HashSet<String>();

        /**
         * Reads the job configuration and, when skip patterns are enabled,
         * loads every cache file registered on the job as a pattern file.
         *
         * @param context task context supplying configuration and cache files
         * @throws IOException if the cache file list cannot be obtained
         * @throws InterruptedException declared by the overridden method
         */
        @Override
        public void setup(Context context) throws IOException,
                InterruptedException {
            Configuration conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
            if (!conf.getBoolean("wordcount.skip.patterns", false)) {
                return;
            }
            // Ask the task context directly instead of building a throw-away Job.
            URI[] patternsURIs = context.getCacheFiles();
            if (patternsURIs == null) {
                // Flag was set (e.g. via -D) but no cache file was registered.
                return;
            }
            for (URI patternsURI : patternsURIs) {
                Path patternsPath = new Path(patternsURI.getPath());
                parseSkipFile(patternsPath.getName());
            }
        }

        /**
         * Adds every line of the given file to {@link #patternsToSkip}.
         *
         * <p>The file is opened by its bare name, i.e. relative to the
         * current working directory of the task. An I/O failure is reported
         * on stderr and otherwise ignored (best effort), so the job keeps
         * running with whatever patterns were read so far.
         *
         * @param fileName name of the pattern file, one regex per line
         */
        private void parseSkipFile(String fileName) {
            // try-with-resources closes the reader even when reading fails.
            try (BufferedReader reader =
                    new BufferedReader(new FileReader(fileName))) {
                String pattern;
                while ((pattern = reader.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err
                        .println("Caught exception while parsing the cached file '"
                                + fileName + "' : "
                                + StringUtils.stringifyException(ioe));
            }
        }

        /**
         * Emits {@code (token, 1)} for each whitespace-separated token of the
         * line, after optional lower-casing and removal of skip patterns, and
         * increments the {@code INPUT_WORDS} counter once per token.
         *
         * @param key input key (unused)
         * @param value one line of input text
         * @param context sink for output pairs and counters
         * @throws IOException if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = (caseSensitive) ? value.toString() : value.toString()
                    .toLowerCase();
            // Each pattern is a regex; every match is deleted from the line.
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(
                        CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    /**
     * Sums the counts of one word. Also used as the combiner, which is valid
     * because its input and output types are identical and addition is
     * associative.
     */
    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value. */
        private final IntWritable result = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @param key the word
         * @param values partial counts for the word
         * @param context sink for the output pair
         * @throws IOException if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>Usage: {@code wordcount <in> <out> [-skip skipPatternFile]}. Generic
     * Hadoop options (such as {@code -Dkey=value}) are consumed first by
     * {@link GenericOptionsParser}. The process exits with status 2 on a
     * usage error, 0 when the job succeeds and 1 when it fails.
     *
     * @param args command-line arguments
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();

        // Separate positional arguments from -skip options before touching
        // the job, so malformed command lines are rejected up front.
        List<String> otherArgs = new ArrayList<String>();
        List<String> skipFiles = new ArrayList<String>();
        boolean malformed = false;
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                if (i + 1 >= remainingArgs.length) {
                    // "-skip" given as the last argument, without a file.
                    malformed = true;
                    break;
                }
                skipFiles.add(remainingArgs[++i]);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        if (malformed || otherArgs.size() != 2) {
            System.err
                    .println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        for (String skipFile : skipFiles) {
            job.addCacheFile(new Path(skipFile).toUri());
            job.getConfiguration().setBoolean("wordcount.skip.patterns",
                    true);
        }

        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2)导出mapreduce.jar
3) 上传到master的目录
/home/chenyun/project/mapreduce
3.运行wordCount
1)运行
hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount /user/joe/wordcount/input /user/joe/wordcount/output
2)查看运行结果
hadoop fs -cat /user/joe/wordcount/output/part-r-00000
=======================================================================================================================
4.过滤不需要统计的字符
1)在本地创建/home/chenyun/data/mapreduce/patterns.txt ,在文件里加入
\.
\,
\!
to
2)把文件上传到hdfs上
hadoop fs -put /home/chenyun/data/mapreduce/patterns.txt /user/joe/wordcount
3)运行
hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount -Dwordcount.case.sensitive=true /user/joe/wordcount/input /user/joe/wordcount/output1 -skip /user/joe/wordcount/patterns.txt
4)查看运行结果
hadoop fs -cat /user/joe/wordcount/output1/part-r-00000
======================================================================================================================
5.忽略大小写,进行统计
1)运行
hadoop jar /home/chenyun/project/mapreduce/mapreduce.jar com.accp.mapreduce.WordCount -Dwordcount.case.sensitive=false /user/joe/wordcount/input /user/joe/wordcount/output5 -skip /user/joe/wordcount/patterns.txt
2)查看运行结果
hadoop fs -cat /user/joe/wordcount/output5/part-r-00000
最新文章
- 让 asp.net 在 mac 上飞
- .NET Core 1.1 发布 文档下载资源汇总
- IIS7错误“Web服务器被配置为不列出此目录的内容”的解决办法
- 使用Python一步一步地来进行数据分析总结
- 【风马一族_Android】第4章Android常用基本控件
- myeclipse自带客户端连接mysql数据库
- Erlang - Download and Install for Linux
- C#语法糖之开篇
- How ASP.NET MVC Works?
- # void :;
- CLR Profile解决内存占用过高
- How do I copy SQL Azure database to my local development server?(如何将Azure 中的数据库备份到本地)
- Maven通俗讲解
- Java Base64 加解密
- There are multiple modules with names that only differ in casing. 黄色warning
- Linux--Introduction and Basic commands(Part one)
- 初见《构建之法》orz……
- 对ajax中数据的得到以及绑定的认识
- oracle批量更新之使用游标进行分批次更新的5种方式及速度比对
- react-router4.0的使用
热门文章
- [Python Cookbook] Numpy Array Manipulation
- (五)github删除仓库
- char 转string
- C#深入学习:泛型修饰符in,out、逆变委托类型和协变委托类型
- Xcode: This device is no longer connected error
- tensorflow global_variables_initializer()
- SilverLight:基础控件使用(4)-日期显示和选择类控件
- IOS开发退出应用程序的代码
- intent 支持的action 动作
- Jquery:怎样让子窗体的div显示在父窗体之上