时间:2021-07-01 10:21:17 帮助过:38人阅读
// Connect to YDB through the HiveServer2 thrift JDBC endpoint and run a
// pushdown query, printing a tab-separated header followed by every row.
// The SQL string must use straight ASCII quotes; the curly quotes in the
// original were an encoding artifact and would break the query.
Connection conn = DriverManager.getConnection("jdbc:hive2://ydbmaster:10009/default", "hdfs", "");
Statement smst = conn.createStatement();
ResultSet rs = smst.executeQuery("/*ydb.pushdown('->')*/ select * from ydb_example_shu where ydbpartion = '3000w' limit 10 /*('<-')pushdown.ydb*/");
ResultSetMetaData m = rs.getMetaData();
int columns = m.getColumnCount();
// Header row: column names separated by double tabs.
for (int i = 1; i <= columns; i++) {
System.out.print(m.getColumnName(i));
System.out.print("\t\t");
}
// Data rows: getString is sufficient for display purposes here.
while (rs.next()) {
for (int i = 1; i <= columns; i++) {
System.out.print(rs.getString(i));
System.out.print("\t\t");
}
System.out.println();
}
rs.close();
// Fix: the Statement was leaked in the original snippet.
smst.close();
conn.close();
依赖的JDBC客户端jar包可以从这个地址获取,本质上就是HIVE的thrift接口,依赖的jar包也是Hive的jar包
http://url.cn/42R4CG8
SQL分析工具有很多,只要支持HIVE接口即可,免费的有Squirrel、收费的有DbVisualizer等
通过可视化报表分析工具,可以极大的提高程序开发的效率,只要是支持HIVE接口的可视化报表工具,都可以与YDB集成,下面以帆软报表为例。
随着时间的日积月累,单个索引会越来越大,从而导致系统瓶颈。YDB不会将全部的数据都完整的创建在一个索引中,YDB会对数据进行分区,分区的方式由用户来定义,可以按照日期分区,也可以按照某些固定的HASH方式来分区。
一条数据的分区,在导入的时候直接指定,具体请参考后面的数据导入用法。
如果按照日期进行分区,每天就会生成一个分区,如需查询哪天的数据,就去对应的分区中检索,其他的分区则闲置。
YDB 的SQL需要通过ydbpartion来指定分区; SQL查询必须要设置分区,而且要写在SQL的最外层。
如果没有指定ydbpartion分区的查询,ydb表默认会去查询 " ydb_default_partion" 这个分区,也就是说,如果我们真的不想进行数据分区,不想在sql上添加ydbpartion的条件来指定分区,那么请将数据都导入到 " ydb_default_partion"这个分区里面。
设置分区例子如下:
ydbpartion ='20140928'
ydbpartion in ('20140928','20140927')
目前不支持大于等于,小于等于的范围指定分区,仅支持等于与in的方式。
如果我们的数据可以按照时间进行切分,是不是切分的越细越好?
很遗憾,YDB并不适合特别多的分区,分区越多代表索引文件越多
1)YDB中打开一个索引是有很大的开销的,打开一个索引加载的列的信息、索引的BlockTree的相关主干节点等,需要消耗较多的内存,而且要持久化到内存里去维护这个索引的状态。这就是为什么大家会发现,对于一个表第一次查询会比较慢,但是我们进行一次count以后,在进行别的查询就会快很多。
2)YDB在一个进程里能够打开的索引数量是有限的,如果超过了打开的索引文件数量,那么就要关闭一些索引,以保证内存不会OOM。
3)小文件太多,对HDFS的NameNode的压力较大。
那么分区粒度控制在多少为好?
基本原则就是在避免索引频繁的打开与关闭的情况下,索引粒度越小越好。
1)如果我们的数量不是很大,一年加在一起还不到10亿,那么我就建议采用按年分区。
2)如果我们的数据处于中等,每月的数据增量为1亿左右,那么我们建议按照季度分区。
3)如果我们的数据每天写入量特别大,如果按照月份分区,单个索引太大会造成写入瓶颈,那么我们建议按照天进行分区。
很多时候我们还可以根据不同的查询方式,采用两种粒度的分区
1)最近一两天的数据经常被查询,我们最近3天的数据按照天进行分区
2)但是偶尔也会发生查询整年的数据,如果采用按天分区的话,一次打开的索引太多,那么我们可以再加一个按照季度的分区。
3)按天的数据分区只保存最近7天的数据,超过7天的数据会通过insert的方式归档按照季度的分区里。
基本类型的存储方式均为 按列存储
YDB类型 |
只索引 |
只存储 |
Hive类型 |
解释 |
string |
synn |
--- |
string |
字符串类型,该类型不分词,通常用来存储比较短的字符串,如类目 |
tint |
tiynn |
--- |
int |
整形32位-适合大范围的range过滤查询 |
tlong |
tlynn |
--- |
bigint |
整形64位-适合大范围的range过滤查询 |
tdouble |
tdynn |
--- |
double |
Double类型-适合大范围的range过滤查询 |
tfloat |
tfynn |
--- |
float |
Float类型-适合大范围的range过滤查询 |
int |
iynn |
--- |
int |
整形32位,占用存储空间少,但是范围查找性能低 |
long |
lynn |
--- |
bigint |
整形64位,占用存储空间少,但是范围查找性能低 |
double |
dynn |
--- |
double |
Double类型,占用存储空间少,但是范围查找性能低 |
float |
fynn |
--- |
float |
Float类型,占用存储空间少,但是范围查找性能低 |
geopoint |
--- |
--- |
bigint |
用于地理位置搜索-使用方法详见《26.地理位置感知搜索.txt》 |
分词( Word Segmentation) 指的是将一个词字序列切分成一个一个单独的词。分词就是将连续的词序列按照一定的规范重新组合成词序列的过程.
分词类型,均为按行存储,在YDB中可以进行模糊检索,但是不能在SQL里面进行group by(YSQL函数以外是可以的)。
YDB类型 |
只索引 |
只存储 |
Hive类型 |
解释 |
simpletext |
simpletextyn |
simpletextny |
string |
ydb内置的普通文本分词 采用1~3元分词 |
haoma |
haomayn |
haomany |
string |
ydb内置的适合号码类型的分词,采用3~5元分词实现,分词粒度为char |
chepai |
chepaiyn |
chepainy |
string |
ydb内置的适合号码类型的分词,采用2~5元分词实现,分词粒度为char |
text |
tyn |
tny |
string |
为lucene默认的standard分词,在(处理手机号,邮箱,IP地址,网址等中英文与字典组合的数据上 不准确,请慎用) |
cjkyy |
cjkyn |
cjkny |
string |
为lucene默认的cjk分词即二元分词 (处理手机号,邮箱,IP地址,网址等中英文与字典组合的数据上 不准确,请慎用) |
以下类型除了分词外,还保存了分词后的词的顺序 ,可以进行顺序匹配
YDB类型 |
只索引 |
只存储 |
Hive类型 |
解释 |
charlike |
--- |
--- |
string |
按照字符char 1~5元分词 (效果较好,term区分了词元,适合车牌,手机号类型的较短文本) |
wordlike |
--- |
--- |
string |
按字与词 1~3元分词 (效果较好,term区分了词元,适合文本类型) |
pchepai |
--- |
--- |
string |
按照字符char 2~5元分词 |
phaoma |
--- |
--- |
string |
按照字符char 3~5元分词 |
psimpletext |
--- |
--- |
string |
按字与词 1~3元分词 |
pyy |
pyn |
pny |
string |
lucene的cjk分词,中文采用二元分词,英文与数字采用 单字分词 |
有些时候,我们想在一个列里面存储多个值的时候,就可以考虑使用多值列了
比如说,可以将一个人 的多个标签值 存储在一个记录里面,一个人的每天的行为数据 放在一个记录里面。
一定要注意,
1.字符串类型的多值列,返回的值的无序,并且是排重的,故这块有额外注意。
2.数值型的则是有序的(与导入的顺序一致),并且是没有排重的。
3.传递的数值是按照空格拆分的,如 11 22 33 44
4.如果传递的是空值,会当做null处理
多值列所有数据类型均为按列存储
YDB类型 |
Hive类型 |
解释 |
mt_syn |
string |
string类型的多值列 |
mt_tlyn |
string |
tlong类型的多值列 |
mt_lyn |
string |
long类型的多值列 |
mt_tdyn |
string |
tdouble类型的多值列 |
mt_dyn |
string |
double类型的多值列 |
mt_iyn |
string |
int类型的多值列 |
mt_tiyn |
string |
tint类型的多值列 |
mt_fyn |
string |
float类型的多值列 |
mt_tfyn |
string |
tfloat类型的多值列 |
-- Create the YDB table (column types are YDB types: t* variants are
-- range-query-optimized; textcjk is a CJK-tokenized full-text column).
/*ydb.pushdown('->')*/
create table ydb_example_shu(
phonenum long,
usernick string,
ydb_sex string,
ydb_province string,
ydb_grade string,
ydb_age string,
ydb_blood string,
ydb_zhiye string,
ydb_earn string,
ydb_prefer string,
ydb_consume string,
ydb_day string,
amtdouble tdouble,
amtlong int,
content textcjk
)
/*('<-')pushdown.ydb*/
通过ydbpartion表向YDB中导入数据,下面示例中的ydb_example_shu为YDB表的表名,3000w为YDB表的分区名。
-- Append rows into partition '3000w' of YDB table ydb_example_shu.
-- Select columns are: YDB table name, partition name, optional row filter
-- (empty string = no filter), then the YROW(field-name, value, ...) payload.
insert into table ydbpartion
select 'ydb_example_shu', '3000w', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_txt;
-- Same load, but overwrite the partition instead of appending.
insert overwrite table ydbpartion
select 'ydb_example_shu', '3000w', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_txt;
-- Load with a row-level filter. The filter is passed as a string; using
-- double quotes for the outer literal avoids escaping the inner quotes
-- (the original nested unescaped single quotes, which cannot parse).
insert into table ydbpartion
select 'ydb_example_shu', '3000w', "ydb_sex='男' and ydb_blood='A'",
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_txt;
#######为什么要控制并发数############
1)启动时候的Map数量不容易控制,如果启动的map数量很多,而Spark又没有容量调度器,会占满所有的资源,影响查询。
2)所以很多时候我们的业务期望,在进行数据导入的时候,不要启动太多的Map数量,而是希望留出一部分资源,能让给查询,于是控制Map数量就显得特别重要了。
3)我们导入数据,倾向于数据能更均衡一些,这样查询的时候,不会因为数据倾斜而影响性能。
4)针对大量小文件,Spark并没有像Hive那样使用了combine inputformat ,合并map查询,这样会导致启动的map数量很多,我们希望依然采用Hive那种能够将一些小的Map进行合并。
YDB提供了combine的方法,用来解决上述问题
类名为cn.net.ycloud.ydb.handle.YdbCombineInputFormat (旧版名字为:cn.net.ycloud.ydb.handle.Ya100FixNumCombineTextInputFormat)
1)####文本形式的示例####
-- Text-format staging table. YdbCombineInputFormat merges many small files
-- into fewer map tasks (combats the small-file/NameNode problem).
drop table if exists ydb_import_txt;
CREATE external table ydb_import_txt(
phonenum string, usernick string, ydb_sex string, ydb_province string, ydb_grade string, ydb_age string, ydb_blood string, ydb_zhiye string, ydb_earn string, ydb_prefer string, ydb_consume string, ydb_day string, amtdouble double,amtlong bigint,content string,multyvalue string
)
row format delimited fields terminated by ','
stored as
INPUTFORMAT 'cn.net.ycloud.ydb.handle.YdbCombineInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/data/example/ydb'
TBLPROPERTIES (
'ydb.combine.input.format.raw.format'='org.apache.hadoop.mapred.TextInputFormat'
);
-- Sanity check that the staging data is readable.
select count(*) from ydb_import_txt limit 10;
insert overwrite table ydbpartion
select 'ydb_example_shu', 'txt', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_txt;
-- Verify the load with a pushdown count against the new partition.
/*ydb.pushdown('->')*/
select count(*) from ydb_example_shu where ydbpartion = 'txt'
/*('<-')pushdown.ydb*/
;
2)####RCFILE格式示例####
-- RCFile-format staging table; the combine format wraps RCFileInputFormat.
drop table if exists ydb_import_rcfile;
CREATE external table ydb_import_rcfile(
phonenum string, usernick string, ydb_sex string, ydb_province string, ydb_grade string, ydb_age string, ydb_blood string, ydb_zhiye string, ydb_earn string, ydb_prefer string, ydb_consume string, ydb_day string, amtdouble double,amtlong bigint,content string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
STORED AS
INPUTFORMAT 'cn.net.ycloud.ydb.handle.YdbCombineInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileOutputFormat'
TBLPROPERTIES (
'ydb.combine.input.format.raw.format'='org.apache.hadoop.hive.ql.io.RCFileInputFormat'
);
-- Populate from the text staging table, then sanity-check.
insert overwrite table ydb_import_rcfile select * from ydb_import_txt;
select count(*) from ydb_import_rcfile limit 10;
insert overwrite table ydbpartion
select 'ydb_example_shu', 'rcfile', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_rcfile;
-- Verify the load with a pushdown count against the new partition.
/*ydb.pushdown('->')*/
select count(*) from ydb_example_shu where ydbpartion = 'rcfile'
/*('<-')pushdown.ydb*/
;
3)####SEQUENCEFILE格式示例####
-- SequenceFile-format staging table with block compression enabled.
drop table if exists ydb_import_sequencefile;
CREATE external table ydb_import_sequencefile(
phonenum string, usernick string, ydb_sex string, ydb_province string, ydb_grade string, ydb_age string, ydb_blood string, ydb_zhiye string, ydb_earn string, ydb_prefer string, ydb_consume string, ydb_day string, amtdouble double,amtlong bigint,content string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
STORED AS
INPUTFORMAT 'cn.net.ycloud.ydb.handle.YdbCombineInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
TBLPROPERTIES (
'ydb.combine.input.format.raw.format'='org.apache.hadoop.mapred.SequenceFileInputFormat'
);
-- Enable block-level compression for the write below.
SET hive.exec.compress.output=true;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.DefaultCodec;
SET mapred.output.compression.type=BLOCK;
insert overwrite table ydb_import_sequencefile select * from ydb_import_txt;
select count(*) from ydb_import_sequencefile limit 10;
insert overwrite table ydbpartion
select 'ydb_example_shu', 'sequencefile', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_sequencefile;
-- Verify the load with a pushdown count against the new partition.
/*ydb.pushdown('->')*/
select count(*) from ydb_example_shu where ydbpartion = 'sequencefile'
/*('<-')pushdown.ydb*/
;
4)####PARQUET格式示例####
###Spark内部对SERDE含有Parquet格式的类名进行了特殊处理,会导致设置的inputformat不生效,所以YDB也特殊处理下,就换成不含有Parquet的名字
-- Parquet-format staging table. The SerDe/OutputFormat class names avoid
-- the substring "Parquet"-triggered special casing in Spark (see note above).
drop table if exists ydb_import_parquet;
CREATE external table ydb_import_parquet(
phonenum string, usernick string, ydb_sex string, ydb_province string, ydb_grade string, ydb_age string, ydb_blood string, ydb_zhiye string, ydb_earn string, ydb_prefer string, ydb_consume string, ydb_day string, amtdouble double,amtlong bigint,content string
)
ROW FORMAT SERDE 'cn.net.ycloud.ydb.handle.combine.YdbParHiveSerDe'
STORED AS
INPUTFORMAT 'cn.net.ycloud.ydb.handle.YdbCombineInputFormat'
OUTPUTFORMAT 'cn.net.ycloud.ydb.handle.combine.YdbParMapredParquetOutputFormat'
TBLPROPERTIES (
'ydb.combine.input.format.raw.format'='org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
);
-- 16 MB row groups to keep split sizes manageable.
set parquet.block.size=16777216;
insert overwrite table ydb_import_parquet select * from ydb_import_txt;
select count(*) from ydb_import_parquet limit 10;
insert overwrite table ydbpartion
select 'ydb_example_shu', 'parquet', '',
YROW(
'phonenum',phonenum,
'usernick',usernick,
'ydb_sex',ydb_sex,
'ydb_province',ydb_province,
'ydb_grade',ydb_grade,
'ydb_age',ydb_age,
'ydb_blood',ydb_blood,
'ydb_zhiye',ydb_zhiye,
'ydb_earn',ydb_earn,
'ydb_prefer',ydb_prefer,
'ydb_consume',ydb_consume,
'ydb_day',ydb_day,
'amtdouble',amtdouble,
'amtlong',amtlong,
'content',content
)
from ydb_import_parquet;
-- Verify the load with a pushdown count against the new partition.
/*ydb.pushdown('->')*/
select count(*) from ydb_example_shu where ydbpartion = 'parquet'
/*('<-')pushdown.ydb*/
;
注意YDB的表强制必须指定分区
为了区分YDB表与Hive表,YDB语句需要使用
/*ydb.pushdown('->')*/ 与 /*('<-')pushdown.ydb*/ 前后包含起来,以方便解析
---- count(*) row count
/*ydb.pushdown('->')*/
select count(*) from ydb_example_shu where ydbpartion = '2015'
/*('<-')pushdown.ydb*/ ;
---- data preview
/*ydb.pushdown('->')*/
select * from ydb_example_shu where ydbpartion = '3000w' limit 10
/*('<-')pushdown.ydb*/;
---- full-text search on the tokenized content column
/*ydb.pushdown('->')*/
select content,usernick from ydb_example_shu where ydbpartion = '3000w' and content='王老吉' limit 10
/*('<-')pushdown.ydb*/;
---- combined filters; "like '([3000 TO 4000] )'" is YDB range syntax
/*ydb.pushdown('->')*/
select ydb_sex,ydb_grade,ydb_age,ydb_blood,amtlong from ydb_example_shu where ydbpartion = '3000w' and ydb_sex='女' and ydb_grade='本科' and (ydb_age='20到30岁' or ydb_blood='O') and (amtlong like '([3000 TO 4000] )') limit 10
/*('<-')pushdown.ydb*/;
---- sum
/*ydb.pushdown('->')*/
select sum(amtdouble) from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/;
---- avg
/*ydb.pushdown('->')*/
select avg(amtdouble) as avgamt from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/;
---- several aggregates at once (the closing marker had a mismatched
---- curly quote in the original, which broke marker parsing)
/*ydb.pushdown('->')*/
select count(*),count(amtdouble),avg(amtdouble),sum(amtdouble),min(amtdouble),max(amtdouble)
,min(ydb_province),max(ydb_province) from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/;
---- group by on a single column
/*ydb.pushdown('->')*/
select ydb_sex,count(*),count(amtdouble),sum(amtdouble) from ydb_example_shu where ydbpartion = '3000w' group by ydb_sex limit 10
/*('<-')pushdown.ydb*/;
---- group by on multiple columns
/*ydb.pushdown('->')*/
select ydb_sex,ydb_province,count(*) as cnt,count(amtdouble),sum(amtdouble) from ydb_example_shu where ydbpartion = '3000w' group by ydb_sex,ydb_province order by cnt desc limit 10
/*('<-')pushdown.ydb*/;
---- top-N ordering
/*ydb.pushdown('->')*/
select ydb_sex, phonenum,amtlong,amtdouble
from ydb_example_shu where ydbpartion='3000w' order by amtdouble desc ,amtlong limit 10
/*('<-')pushdown.ydb*/;
按照时间逆序排序可以说是很多日志系统的硬指标。在延云YDB系统中,我们改变了传统的暴力排序方式,通过索引技术,可以超快对数据进行单列排序,不需要全表暴力扫描,这个技术我们称之为BlockSort,目前支持tlong、tdouble、tint、tfloat四种数据类型。
由于BlockSort是借助搜索的索引来实现的,所以采用blockSort的排序,不需要暴力扫描,性能有大幅度的提升。
BlockSort的排序,并非是预计算的方式,可以全表进行排序,也可以基于任意的过滤筛选条件进行过滤排序。
|
正常写法 |
blockSort写法 |
单列升序 |
/*ydb.pushdown('->')*/ select tradetime, nickname from blocksort_ydb order by tradetime limit 10 /*('<-')pushdown.ydb*/;
|
/*ydb.pushdown('->')*/ select tradetime, nickname from blocksort_ydb where ydbkv='blocksort.field:tradetime' and ydbkv='blocksort.limit:10' order by tradetime limit 10 /*('<-')pushdown.ydb*/; |
单列降序 |
/*ydb.pushdown('->')*/ select tradetime, nickname from blocksort_ydb order by tradetime desc limit 10 /*('<-')pushdown.ydb*/; |
/*ydb.pushdown('->')*/ select tradetime, nickname from blocksort_ydb where ydbkv='blocksort.field:tradetime' and ydbkv='blocksort.limit:10' and ydbkv='blocksort.desc:true' order by tradetime desc limit 10 /*('<-')pushdown.ydb*/; |
----导出数据到hive表
-- Export a YDB partition into a Hive table.
insert overwrite table ydb_import_importtest
/*ydb.pushdown('->')*/
select phonenum,usernick,ydb_sex,ydb_province,
ydb_grade,ydb_age,ydb_blood,ydb_zhiye,ydb_earn,
ydb_prefer,ydb_consume,ydb_day,amtdouble,amtlong,content
from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/;
#有limit的导出示例 (在Spark的当前版本有BUG,需要采用如下变通方法解决)
-- Export with a row cap: the ydbkv key limits the pushdown result set and
-- the outer order by rand() limit works around a Spark limit-on-insert bug.
insert overwrite table ydb_import_importtest
select * from (
/*ydb.pushdown('->')*/
select
phonenum,usernick,ydb_sex,ydb_province,ydb_grade,ydb_age,ydb_blood,ydb_zhiye,ydb_earn,ydb_prefer,ydb_consume,ydb_day,amtdouble,amtlong,content
from ydb_example_shu where ydbpartion = '3000w' and ydbkv='export.max.return.docset.size:1000'
/*('<-')pushdown.ydb*/
) tmp order by rand() limit 1000;
----数据导出到YDB的其他分区里示例
-- Copy the '3000w' partition into a new partition 'test3' of the same table.
insert overwrite table ydbpartion
select 'ydb_example_shu', 'test3', '',
YROW(
'phonenum',tmp.phonenum,
'usernick',tmp.usernick,
'ydb_sex',tmp.ydb_sex,
'ydb_province',tmp.ydb_province,
'ydb_grade',tmp.ydb_grade,
'ydb_age',tmp.ydb_age,
'ydb_blood',tmp.ydb_blood,
'ydb_zhiye',tmp.ydb_zhiye,
'ydb_earn',tmp.ydb_earn,
'ydb_prefer',tmp.ydb_prefer,
'ydb_consume',tmp.ydb_consume,
'ydb_day',tmp.ydb_day,
'amtdouble',tmp.amtdouble,
'amtlong',tmp.amtlong,
'content',tmp.content
)
from (
/*ydb.pushdown('->')*/
select
phonenum,usernick,ydb_sex,ydb_province,ydb_grade,ydb_age,ydb_blood,ydb_zhiye,ydb_earn,ydb_prefer,ydb_consume,ydb_day,amtdouble,amtlong,content
from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/
) tmp
;
----导出数据到HDFS
由于Spark当前版本无法通过insert Directory的方式直接导出数据到HDFS,但是可以将数据导出到Hive表,故数据导出到HDFS可以通过导出到Hive表变通的方式来解决
可以通过创建一个导出表来解决
-- Export target table; rows land as files under the HDFS location, which
-- doubles as a way to export YDB data to HDFS (Spark lacks insert directory).
CREATE external table ydb_import_importtest(
phonenum bigint, usernick string, ydb_sex string, ydb_province string, ydb_grade string, ydb_age string, ydb_blood string, ydb_zhiye string, ydb_earn string, ydb_prefer string, ydb_consume string, ydb_day string, amtdouble double,amtlong int,content string
)location '/data/example/ydb_import_importtest';
如果我们创建表的时候,没有加location,我们可以通过show create table xxx表名 可以看到location的位置
1)---两个卡口left semi join
-- Plates seen at BOTH toll gates 1 and 2 (LEFT SEMI JOIN = EXISTS semantics).
select k1.vehiclePlate as vehiclePlate from (
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='1'
/*('<-')pushdown.ydb*/
) k1
LEFT SEMI JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='2'
/*('<-')pushdown.ydb*/
) k2
on (k1.vehiclePlate=k2.vehiclePlate);
+---------------+--+
| vehiclePlate |
+---------------+--+
| c22 |
| c23 |
| c33 |
| c34 |
+---------------+--+
2)---两个卡口left join
-- All plates from gate 1, matched against gate 2 (NULL when unseen at 2).
-- Fix: the original k2 subquery filtered tollCode='1' (same as k1), which
-- contradicts the sample output below showing c11 -> NULL; it must be '2'.
select k1.vehiclePlate as vehiclePlate,k2.vehiclePlate from (
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='1'
/*('<-')pushdown.ydb*/
) k1
LEFT JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='2'
/*('<-')pushdown.ydb*/
) k2
on (k1.vehiclePlate=k2.vehiclePlate);
+---------------+---------------+--+
| vehiclePlate | vehiclePlate |
+---------------+---------------+--+
| c11 | NULL |
| c22 | c22 |
| c23 | c23 |
| c33 | c33 |
| c34 | c34 |
+---------------+---------------+--+
3)---三个卡口left semi join
-- Plates seen at all three gates: chain two LEFT SEMI JOINs.
select k21.vehiclePlate from(
select k1.vehiclePlate as vehiclePlate from (
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='1'
/*('<-')pushdown.ydb*/
) k1
LEFT SEMI JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='2'
/*('<-')pushdown.ydb*/
) k2
on (k1.vehiclePlate=k2.vehiclePlate)
) k21
LEFT SEMI JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='3'
/*('<-')pushdown.ydb*/
) k22 on k21.vehiclePlate=k22.vehiclePlate order by k21.vehiclePlate;
+---------------+--+
| vehiclePlate |
+---------------+--+
| c33 |
| c34 |
+---------------+--+
4)---三个卡口left join
-- All gate-1 plates, left-joined through gates 2 and 3.
select k21.vehiclePlate,k22.vehiclePlate from(
select k1.vehiclePlate as vehiclePlate from (
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='1'
/*('<-')pushdown.ydb*/
) k1
LEFT JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='2'
/*('<-')pushdown.ydb*/
) k2
on (k1.vehiclePlate=k2.vehiclePlate)
) k21
LEFT JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='3'
/*('<-')pushdown.ydb*/
) k22 on k21.vehiclePlate=k22.vehiclePlate ;
+---------------+---------------+--+
| vehiclePlate | vehiclePlate |
+---------------+---------------+--+
| c11 | NULL |
| c22 | NULL |
| c23 | NULL |
| c33 | c33 |
| c34 | c34 |
+---------------+---------------+--+
5)----三个卡口 先left SEMI join 之后再 left join
-- Plates seen at gates 1 AND 2 (semi join), then left-joined against gate 3.
select k21.vehiclePlate,k22.vehiclePlate from(
select k1.vehiclePlate as vehiclePlate from (
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='1'
/*('<-')pushdown.ydb*/
) k1
LEFT SEMI JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='2'
/*('<-')pushdown.ydb*/
) k2
on (k1.vehiclePlate=k2.vehiclePlate)
) k21
LEFT JOIN
(
/*ydb.pushdown('->')*/
select vehiclePlate,tollCode from vehiclepass where ydbpartion = '3000w' and tollCode='3'
/*('<-')pushdown.ydb*/
) k22 on k21.vehiclePlate=k22.vehiclePlate ;
+---------------+---------------+--+
| vehiclePlate | vehiclePlate |
+---------------+---------------+--+
| c22 | NULL |
| c23 | NULL |
| c33 | c33 |
| c34 | c34 |
+---------------+---------------+--+
1)--union--统计的结果
-- Sum per-partition counts; union all is correct here (no dedup wanted).
select sum(cnt) as cnt from
(
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '3000w'
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '300winsert'
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '300winsert2'
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '3000w' and content='王老吉'
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '20151011' and content='工商银行'
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select count(*) as cnt from ydb_example_shu where ydbpartion = '20151011'
/*('<-')pushdown.ydb*/
) tmp limit 10;
2)--union order by的结果,注意,这里有个子查询SQL
-- Per-keyword top-1 by amtlong, combined with union all.
-- Fix: the first "select" was split across two lines ("s" / "elect") by
-- the original formatting, which cannot parse.
select * from
(
/*ydb.pushdown('->')*/
select amtlong,content from ydb_example_shu where ydbpartion = '3000w' and content='旺旺' order by amtlong desc limit 1
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select amtlong,content from ydb_example_shu where ydbpartion = '3000w' and content='王老吉' order by amtlong desc limit 1
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select amtlong,content from ydb_example_shu where ydbpartion = '3000w' and content='汇源' order by amtlong desc limit 1
/*('<-')pushdown.ydb*/
union all
/*ydb.pushdown('->')*/
select amtlong,content from ydb_example_shu where ydbpartion = '3000w' and content='哇哈哈' order by amtlong desc limit 1
/*('<-')pushdown.ydb*/
) tmp limit 1000;
3)YDB表的多个分区一起查询,通过IN来实现
-- Query many partitions at once. Only '=' and IN are supported for
-- ydbpartion; range operators are not.
/*ydb.pushdown('->')*/
select count(*),count(amtdouble),sum(amtdouble),avg(amtdouble),min(amtdouble),max(amtdouble),min(ydb_province),max(ydb_province) from ydb_example_shu where ydbpartion in ( '3000w0','3000w1' ,'3000w2','3000w3','3000w4','3000w5','3000w6','3000w7','3000w8','3000w9','3000w10' ,'3000w11','3000w12','3000w13','3000w14','3000w15' ,'3000w16' ,'3000w17','3000w18','3000w19'
,'3000a0','3000a1' ,'3000a2','3000a3','3000a4','3000a5','3000a6','3000a7','3000a8','3000a9','3000a10' ,'3000a11','3000a12','3000a13','3000a14','3000a15' ,'3000a16' ,'3000a17','3000a18','3000a19'
,'3000b0','3000b1' ,'3000b2','3000b3','3000b4','3000b5','3000b6','3000b7','3000b8','3000b9','3000b10' ,'3000b11','3000b12','3000b13','3000b14','3000b15' ,'3000b16' ,'3000b17','3000b18','3000b19'
)
/*('<-')pushdown.ydb*/
;
-----#####如果distinct的数据并不多,可以考虑采用collect_set 性能较好#######
1)----####直接count distinct##########
-- Distinct counts via collect_set (efficient when cardinality is small).
select
size(collect_set(tmp.ydb_sex)) as dist_sex,
size(collect_set(tmp.ydb_province)) as dist_province,
count(*) as cnt,
count(tmp.amtlong) as cnt_long,
count(distinct tmp.amtlong) as dist_long
from (
/*ydb.pushdown('->')*/
select ydb_sex,ydb_province,amtlong from ydb_example_shu where ydbpartion = '3000w' and content='王老吉'
/*('<-')pushdown.ydb*/
) tmp limit 10;
2)----group by 加 count distinct####
-- Per-sex distinct counts: group by applied on top of the pushdown result.
select
tmp.ydb_sex as ydb_sex,
size(collect_set(tmp.ydb_province)) as dist_province,
count(*) as cnt,
count(tmp.amtlong) as cnt_long,
count(distinct tmp.amtlong) as dist_long
from
(
/*ydb.pushdown('->')*/
select ydb_sex,ydb_province,amtlong from ydb_example_shu where ydbpartion = '3000w' and content='王老吉'
/*('<-')pushdown.ydb*/
) tmp
group by tmp.ydb_sex limit 10;
-- Roll up per-province aggregates into one '#'-joined string per sex.
select ydb_sex,concat_ws('#', sort_array(collect_set(concat_ws(',',ydb_province,cnt,cntamt,sumamt)))) from (
/*ydb.pushdown('->')*/
select ydb_sex,ydb_province,count(*) as cnt,count(amtdouble) as cntamt,sum(amtdouble) as sumamt from ydb_example_shu where ydbpartion = '3000w' group by ydb_sex,ydb_province
/*('<-')pushdown.ydb*/
)tmp group by ydb_sex limit 10;
select ydb_province,sum(cnt) as scnt,concat_ws(‘#‘, sort_array(collect_s