天天看點

Hive一些常用文法

  • like與rlike差別:like 是通配符,rlike是正則
  • 取消hive一些文法限制:set hive.mapred.mode=nonstrict;
  • 設定Reduce-Task(增加運作速度):set mapred.reduce.tasks = 30;
  • count(*)計算的時候包含了NULL值,而count(expr)則不包含空值,例如:
select count(*) from (select disctinct column from xxx) x
Select count(distinct column) from xxx
結果不一樣,因為count(distinct column)已經去null了
           
  • 建表語句
use mid_trafficwisdom;
drop table if exists city_with_station;
create table  if not exists city_with_station(
start_city_id bigint comment '起始城市ID',
start_city_name string comment '起始城市名',
end_city_id bigint comment '到達城市ID',
end_city_name string comment '到達城市名',
com_start_city_id bigint comment '公共起始城市ID',
com_start_city_name string comment '公共起始城市名',
com_end_city_id bigint comment '公共到達城市ID',
com_end_city_name string comment '公共到達城市名',
traffic_type string comment '交通類型',
section_station_set string comment '站到站集合',
create_time string comment '資料插入時間'
)comment '城市包含站-站集合'
location '/data/train/trafficwisdom/mid/city_with_station'
           
  • 插入分區表
INSERT OVERWRITE TABLE mid_trafficwisdom.search_transfer_log PARTITION(push_date ='2019-01-29' )
SELECT from_city_name,to_city_name,start_date,traffic_type,from_station_name,to_station_name,from_station_code,to_station_code,send_time
from tmp_trafficwisdom.search_transfer_log_tmp where from_city_name is not null and to_city_name is not null
           
  • 删除分區表:
ALTER TABLE mid_trafficwisdom.order_train_log DROP PARTITION (log_date = '2019-01-29');
//批量删除
ALTER TABLE ota_interface_check_info_log DROP PARTITION(log_date <='2017-08-10')
           

修改表名

ALTER TABLE name RENAME TO new_name
           
  • 分區表批量插入分區表
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict; 
INSERT OVERWRITE TABLE mid_trafficwisdom.ml_cross_data PARTITION (log_date)
SELECT D.userid, D.city, D.category, D.from_place, D.to_place
	, D.isclick, D.future_day, D.hour_time, D.banner_min_price, D.banner_min_time
	, D.start_city_id, D.start_city_name, D.end_city_id, D.end_city_name, D.push_date
FROM tmp_trafficwisdom.ml_cross_data D 
           
  • 修改表字段
ALTER TABLE app_trafficwisdom.route_traffic_transfer_without_date CHANGE transfer_overtime transfer_overtime bigint comment '第一段途徑時間'
           
  • ROW_NUMBER使用
SELECT transport_code, start_station_code, start_station_name, end_station_code, end_station_name
	, run_time, price, create_date
FROM (
	SELECT *, ROW_NUMBER() OVER (PARTITION BY transport_code, start_station_code, end_station_code ORDER BY create_date DESC) AS rn
	FROM mid_trafficwisdom.section_ticket_price
	WHERE traffic_type = 'F'
) A WHERE A.rn = 1
           
  • 一行拆多行(explode)
SELECT *
FROM (
	SELECT from_city_name, to_city_name, COUNT(*) AS search_num
		, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
	FROM mid_trafficwisdom.search_line_log
	WHERE push_date = '2019-01-29'
		AND from_city_name IS NOT NULL
		AND to_city_name IS NOT NULL
	GROUP BY from_city_name, to_city_name
) d
	LATERAL VIEW explode(traffic_types) adTable AS traffic_type limit 100;
           

多列合并一列

SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
					, end_station_name, seat_infos.seat_name, seat_infos.seat_price
				FROM (
					SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
						, end_station_name
						, array(
                            named_struct('seat_name', '硬座', 'seat_price', hard_seat_price), 
                            named_struct('seat_name', '無座', 'seat_price', no_seat_price), 
                            named_struct('seat_name', '軟卧上', 'seat_price', soft_sleeper_up_price), 
                            named_struct('seat_name', '軟卧下', 'seat_price', soft_sleeper_down_price), 
                            named_struct('seat_name', '硬卧上', 'seat_price', hard_sleeper_up_price), 
                            named_struct('seat_name', '硬卧中', 'seat_price', hard_sleeper_mid_price), 
                            named_struct('seat_name', '硬卧下', 'seat_price', hard_sleeper_down_price), 
                            named_struct('seat_name', '二等座', 'seat_price', second_seat_price), 
                            named_struct('seat_name', '一等座', 'seat_price', first_seat_price), 
                            named_struct('seat_name', '商務座', 'seat_price', business_seat_price)) AS seat_info
					FROM mid_trafficwisdom.ticket_train_log
					WHERE duration_date >= '{today,yyyy-MM-dd}'
				) M
					LATERAL VIEW explode(seat_info) seat_info AS seat_infos
				WHERE seat_infos.seat_price IS NOT NULL	AND seat_infos.seat_price > 0;
           
  • 資料傾斜設定
set hive.groupby.mapaggr.checkinterval=100000 ;--這個是group的鍵對應的記錄條數超過這個值則會進行分拆,值根據具體資料量設定
set hive.groupby.skewindata=true; --如果是group by過程出現傾斜 應該設定為true
set hive.skewjoin.key=100000; --這個是join的鍵對應的記錄條數超過這個值則會進行分拆,值根據具體資料量設定
set hive.optimize.skewjoin=true;--如果是join 過程出現傾斜 應該設定為true
           
  • split拆分

    例如某個字段名value為:

    o498X0bh9ySoH2h3ijQWMDzYSrjk_852_010002_2_319527329_南通_商丘_2019-01-29_1_367347825_780_1小時50分鐘"

    抽取最後一個字段"1小時50分鐘:
split(value,'_')[size(split(value, "_" )) - 1 ] as min_time,
           
  • regexp_extract抽取字段

    如果想抽取1小時50分鐘中的"1"和"50"轉換成秒

    原資料min_time可能為:1小時50分鐘,2時50分鐘,2時,35分鐘。如下:

if(regexp_extract(min_time,'^([0-9]*?)小?時.*$',1)>0,regexp_extract(min_time,'^([0-9]*?)小?時.*$',1),0)*3600+if(regexp_extract(min_time,'([0-9]*?)分鐘.*$',1)>0 ,regexp_extract(min_time,'([0-9]*?)分鐘.*$',1),0)*60 as min_time
           
  • explode用法
SELECT from_city_name, to_city_name, search_num, traffic_type
FROM (
	SELECT from_city_name, to_city_name, COUNT(*) AS search_num
		, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
	FROM mid_trafficwisdom.search_direct_log
	WHERE log_date = '2019-02-26'
		AND from_city_name IS NOT NULL
		AND to_city_name IS NOT NULL
	GROUP BY from_city_name, to_city_name
) d
	LATERAL VIEW explode(traffic_types) adTable AS traffic_type
           
  • 時間轉換
from_unixtime(cast (time as int) , 'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(cast(substring(time,1,10) as int),'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(unix_timestamp(date, 'yyyyMMdd'), 'yyyy-MM-dd') as create_date
to_date(string timestamp)   //日期時間轉日期函數
select date_sub('2012-12-08',10) from dual;  //2012-11-28
           
  • 字元串拼接
CONCAT_WS('+',M.wtboscheduleno,M.wtbobusno) AS bus_code
           
  • json解析字段

    wtodatasourcenew字段内容為:

{
	"abTest": [{
		"expNo": "20180716_JLH201807",
		"expGroup": "Z"
	}, {
		"expNo": "20181210_BI",
		"expGroup": "Z"
	}, {
		"expNo": "20190114_guolv",
		"expGroup": "Z"
	}, {
		"expNo": "20181022_PaiXu",
		"expGroup": "Z"
	}],
	"requestId": "c6e8bf08a64ffee900988b3124ee4442"
}
           
SELECT *,GET_JSON_OBJECT(wtodatasourcenew,'$.requestId')
FROM base_tcdctrafficwisdomtravelorder.wisdomtravelorder where wtocreatedate  BETWEEN '2019-03-17 00:00:00' AND '2019-03-17 23:59:59'
LIMIT 100 
           
  • 取小數點
CASE 
	WHEN round(b.order_num * 1.0 / a.search_num, 6) IS NULL THEN 0 
	ELSE round(b.order_num * 1.0 / a.search_num, 6) 
END AS allpercent
           

繼續閱讀