要求:基于hasgj表,统计指定日期新增的mac数量。

因为spark直接扫描hbase表,对hbase集群访问量太大,给集群造成压力,这里考虑用spark读取HFile进行数据分析。

1、建立hasgj表的快照表:hasgjSnapshot

语句为:snapshot 'hasgj','hasgjSnapshot'

2、计算每天mac增量的代码如下:

package com.ba.sparkReadHbase.operatorHfile.hfileinputformat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableMap;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2; public class SparkReadHFile {
private static String convertScanToString(Scan scan) throws IOException {
ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
return Base64.encodeBytes(proto.toByteArray());
} public static void main(String[] args) throws IOException {
final String date=args[0];
int max_versions = 3;
SparkConf sparkConf = new SparkConf().setAppName("sparkReadHfile");
JavaSparkContext sc = new JavaSparkContext(sparkConf);
Configuration hconf = HBaseConfiguration.create();
hconf.set("hbase.rootdir", "/hbase");
hconf.set("hbase.zookeeper.quorum", "master,slave1,slave2");
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("ba"));
scan.setMaxVersions(max_versions);
hconf.set(TableInputFormat.SCAN, convertScanToString(scan));
Job job = Job.getInstance(hconf);
Path path = new Path("/snapshot");
String snapName ="hasgjSnapshot";
TableSnapshotInputFormat.setInput(job, snapName, path);
JavaPairRDD<ImmutableBytesWritable, Result> newAPIHadoopRDD = sc.newAPIHadoopRDD(job.getConfiguration(), TableSnapshotInputFormat.class, ImmutableBytesWritable.class,Result.class);
List<String> collect = newAPIHadoopRDD.map(new Function<Tuple2<ImmutableBytesWritable, Result>, String>(){
private static final long serialVersionUID = 1L;
public String call(Tuple2<ImmutableBytesWritable, Result> v1)
throws Exception {
// TODO Auto-generated method stub
String newMac =null;
Result result = v1._2();
if (result.isEmpty()) {
return null;
}
String rowKey = Bytes.toString(result.getRow());
//System.out.println("行健为:"+rowKey);
NavigableMap<byte[], byte[]> familyMap = result.getFamilyMap(Bytes.toBytes("ba"));
Set<Entry<byte[], byte[]>> entrySet = familyMap.entrySet();
java.util.Iterator<Entry<byte[], byte[]>> it = entrySet.iterator();
String colunNmae =null;
String minDate="34561213";
while(it.hasNext()){
colunNmae = new String(it.next().getKey());//列
if(colunNmae.compareTo(minDate)<0){
minDate=colunNmae;
}
} if (date.equals(minDate)) {
// row=rowKey.substring(4);
newMac=rowKey;
//ls.add(rowKey.substring(4));
//bf.append(rowKey+"----");
}
return newMac;
}
}).collect();
ArrayList<String> arrayList = new ArrayList<String>();
for (int i = 0; i < collect.size(); i++) {
if (collect.get(i) !=null) {
arrayList.add(collect.get(i));
}
}
System.out.println("新增mac数"+(arrayList.size())); }
}

3、特别说明:

hasgj表的表结构(行键为mac,列族为ba,列名为该mac出现的日期,值为空;某个mac最小的列名即其首次出现的日期,代码据此判断是否为当天新增):

0000F470ABF3A587                          column=ba:20170802, timestamp=1517558687930, value=                                                                         
 0000F470ABF3A587                          column=ba:20170804, timestamp=1517593923254, value=                                                                         
 0000F470ABF3A587                          column=ba:20170806, timestamp=1517620990589, value=                                                                         
 0000F470ABF3A587                          column=ba:20170809, timestamp=1517706294758, value=                                                                         
 0000F470ABF3A587                          column=ba:20170810, timestamp=1517722369020, value=                                                                         
 0000F470ABF3A587                          column=ba:20170811, timestamp=1517796060984, value=                                                                         
 0000F470ABF3A587                          column=ba:20170816, timestamp=1517882948856, value=                                                                         
 0000F470ABF3A587                          column=ba:20170818, timestamp=1517912603602, value=                                                                         
 0000F470ABF3A587                          column=ba:20170819, timestamp=1517938488763, value=                                                                         
 0000F470ABF3A587                          column=ba:20170821, timestamp=1517989742180, value=                                                                         
 0000F470ABF3A587                          column=ba:20170827, timestamp=1518383470292, value=                                                                         
 0000F470ABF3A587                          column=ba:20170828, timestamp=1520305841272, value=                                                                         
 0000F470ABF3A587                          column=ba:20170831, timestamp=1522115116459, value=                                                                         
 0000F4730088A5D3                          column=ba:20170805, timestamp=1517598564121, value=                                                                         
 0000F47679E83F7D                          column=ba:20170817, timestamp=1517890046587, value=                                                                         
 0000F47FBA753FC7                          column=ba:20170827, timestamp=1518365792130, value=                                                                         
 0000F48C02F8EB83                          column=ba:20170810, timestamp=1517729864592, value=                                                                         
 0000F49578E63F55                          column=ba:20170828, timestamp=1520302223714, value=                                                                         
 0000F4AC4A93F7A5                          column=ba:20170810, timestamp=1517724545955, value=                                                                         
 0000F4B4807679AA                          column=ba:20170801, timestamp=1517543775374, value=                                                                         
 0000F4B7E374C0FF                          column=ba:20170804, timestamp=1517578239073, value=                                                                         
 0000F4BDBF6EBF37                          column=ba:20170829, timestamp=1520558747936, value=                                                                         
 0000F4CB52FDDA58                          column=ba:20170806, timestamp=1517638015583, value=                                                                         
 0000F4CB52FDDA58                          column=ba:20170807, timestamp=1517677405900, value=

4、提交作业命令:

./spark-submit --master yarn-client  --num-executors 7 --executor-cores 2 --driver-memory 2g  --executor-memory 30g --class com.ba.sparkReadHbase.operatorHfile.hfileinputformat.SparkReadHFile  /home/xxx0108/ftttttttt/testJar/sparkTest9.jar 20170806

最新文章

  1. iOS8.3发布了Swift 1.2带来哪些新变化
  2. PHP初步(上)
  3. ASP.NET MVC 多语言实现技巧 最简、最易维护和最快速开发
  4. nefu558 bfs
  5. 通过反射获取Android通知栏高度
  6. BZOJ 2241 打地鼠
  7. autolayout 总结
  8. mysql安装启动教程(两种方法)
  9. Oracle自动增长的序列号
  10. 玩转Web之servlet(三)---一张图看懂B/S架构
  11. Android开发之Intent.Action
  12. 基于arm开发板四个按键控制四个灯亮
  13. plsql中文乱码问题方案解决
  14. iOS监听模式系列之NSNotificationCenter的简单使用
  15. 20175212童皓桢 《Java程序设计》第六周学习总结
  16. jenkins 构建到最后报权限的问题
  17. ionic3 title 不居中问题
  18. bzoj4773: 负环
  19. 将 Smart 构件发布到 Maven 中央仓库
  20. linux 下查看c 函数帮助

热门文章

  1. Ubuntu 18.04 LTS 设置代理(系统代理;http 代理;sock5 代理;apt 代理 ...)
  2. yzoj 2377 颂芬梭哈 题解
  3. 第二次用map23333
  4. Redis—简介
  5. JAVA操作ORACLE大对象
  6. shim和polyfill 区别解释
  7. java输出月的日历控制台
  8. 【日语】【ZZ】日语人称小结
  9. 如何避免Linux操作系统客户端登陆超时-linux命令之TMOUT=
  10. kubernetes 应用快速入门