--仍然使用row_number的例子;方便读者理解 select user_id, visit_time, visit_cnt, first_value(visit_time) over(partition by user_id order by visit_date desc) as first_value_time, row_number() over(partition by user_id order by visit_date desc) as rank from wedw_tmp.tmp_url_info order by user_id,rank
结果填充
last_value:使用频率 ★
按照分组排序取当前行的最后一个值;这个函数好像没啥卵用
1 2 3 4 5 6 7 8 9 10 11
--仍然使用row_number的例子;方便读者理解 select user_id, visit_time, visit_cnt, last_value(visit_time) over(partitionby user_id orderby visit_date desc) as first_value_time, row_number() over(partitionby user_id orderby visit_date desc) as rank from wedw_tmp.tmp_url_info orderby user_id,rank
select user_id, visit_time, visit_cnt, row_number() over(partition by user_id order by visit_date desc) as rank, lag(visit_time,1,'1700-01-01') over(partition by user_id order by visit_date desc) as lead_time from wedw_tmp.tmp_url_info order by user_id
集合相关
collect_set:使用频率 ★★★★★
将分组内的数据放入到一个集合中,具有去重的功能;
1 2 3 4 5 6
--统计每个用户具体哪些天访问过 select user_id, collect_set(visit_date) over(partition by user_id) as visit_date_set from wedw_tmp.tmp_url_info
填充结果
collect_list:使用频率 ★★★★★
和collect_set一样,但是没有去重功能
1 2 3 4
select user_id, collect_set(visit_date) over(partition by user_id) as visit_date_set from wedw_tmp.tmp_url_info
select visit_url, parse_url(visit_url, 'HOST') as url_host, --解析host parse_url(visit_url, 'PATH') as url_path, --解析path parse_url(visit_url, 'QUERY') as url_query,--解析请求参数 parse_url(visit_url, 'REF') as url_ref, --解析ref parse_url(visit_url, 'PROTOCOL') as url_protocol, --解析协议 parse_url(visit_url, 'AUTHORITY') as url_authority,--解析author parse_url(visit_url, 'FILE') as url_file, --解析filepath parse_url(visit_url, 'USERINFO') as url_user_info --解析userinfo from wedw_tmp.tmp_url_info
填充结果
reflect:使用频率 ★★
该函数是利用java的反射来实现一些功能,目前笔者只用到了关于url编解码
1 2 3 4 5
--url编码 select visit_url, reflect("java.net.URLEncoder", "encode", visit_url, "UTF-8") as visit_url_encode from wedw_tmp.tmp_url_info
填充结果
1 2 3 4 5 6 7 8 9 10 11
--url解码 select visit_url, reflect("java.net.URLDecoder", "decode", visit_url_encode, "UTF-8") as visit_url_decode from ( select visit_url, reflect("java.net.URLEncoder", "encode", visit_url, "UTF-8") as visit_url_encode from wedw_tmp.tmp_url_info )t
填充结果
JSON相关
get_json_object:使用频率 ★★★★★
通常用于获取json字符串中的key,如果不存在则返回null
1 2 3 4 5 6 7 8 9
select get_json_object(json_data,'$.user_id') as user_id, get_json_object(json_data,'$.age') as age --不存在age,则返回null from ( select concat('{"user_id":"',user_id,'"}') as json_data from wedw_tmp.tmp_url_info )t
--按照用户+访问日期统计统计次数 select user_id, visit_date, sum(visit_cnt) as visit_cnt from wedw_tmp.tmp_url_info group by user_id,visit_date grouping sets(user_id,visit_date)
--下图的结果类似于以下sql select user_id, NULL as visit_date, sum(visit_cnt) as visit_cnt from wedw_tmp.tmp_url_info union all select NULL as user_id, visit_date, sum(visit_cnt) as visit_cnt from wedw_tmp.tmp_url_info union all select user_id, visit_date, sum(visit_cnt) as visit_cnt from wedw_tmp.tmp_url_info
字符相关
concat:使用频率 ★★★★★
字符拼接,concat(string|binary A, string|binary B…);该函数比较简单
1 2
select concat('a','b','c') --最后结果就是abc
concat_ws:使用频率 ★★★★★
按照指定分隔符将字符或者数组进行拼接;concat_ws(string SEP, array)/concat_ws(string SEP, string A, string B…)
--提取csdn文章编号 select distinct regexp_extract(visit_url,'/details/([0-9]+)',1) as visit_url from wedw_tmp.tmp_url_info
substring_index:使用频率 ★★
substring_index(string A, string delim, int count)
截取第count分隔符之前的字符串,如count为正则从左边开始截取,如果为负则从右边开始截取
1 2 3 4 5 6 7 8 9 10
--比如将2020年的用户组合获取前2个用户,下面的sql将上面讲解的函数都结合在一起使用了 select user_set, substring_index(user_set,',',2) as user_id from ( select distinct concat_ws(',',collect_set(user_id) over(partition by year(visit_date))) as user_set from wedw_tmp.tmp_url_info )t
填充结果
条件判断
if:使用频率 ★★★★★
if(boolean testCondition, T valueTrue, T valueFalseOrNull):判断函数,很简单
--判断是否为user1用户 select distinct user_id, if(user_id='user1',true,false) as flag from wedw_tmp.tmp_url_info
填充结果
case when :使用频率 ★★★★★
CASE a WHEN b THEN c [WHEN d THEN e] [ELSE f] END
如果a=b就返回c,a=d就返回e,否则返回f 如CASE 4 WHEN 5 THEN 5 WHEN 4 THEN 4 ELSE 3 END 将返回4
相比if,个人更倾向于使用case when
1 2 3 4 5 6 7
--仍然以if上面的列子 select distinct user_id, case when user_id='user1' then 'true' when user_id='user2' then 'test' else 'false' end as flag from wedw_tmp.tmp_url_info
coalesce:使用频率 ★★★★★
COALESCE(T v1, T v2, …)
返回第一非null的值,如果全部都为NULL就返回NULL
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
--该函数结合lead或者lag更容易贴近实际业务需求,这里使用lead,并取后3行的值作为当前行值 select user_id, visit_time, rank, lead_time, coalesce(visit_time,lead_time) as has_time from ( select user_id, visit_time, visit_cnt, row_number() over(partition by user_id order by visit_date desc) as rank, lead(visit_time,3) over(partition by user_id order by visit_date desc) as lead_time from wedw_tmp.tmp_url_info order by user_id )t
数值相关
round:使用频率 ★★
round(DOUBLE a):返回对a四舍五入的BIGINT值,
round(DOUBLE a, INT d):返回DOUBLE型d的保留n位小数的DOUBLW型的近似值
select unix_timestamp() as current_timestamp,--获取当前时间戳 unix_timestamp('2020-09-01 12:03:22') as speical_timestamp,--指定时间对于的时间戳 from_unixtime(unix_timestamp(),'yyyy-MM-dd') as current_date --获取当前日期