- like與rlike差別:like 是通配符,rlike是正則
- 取消hive一些文法限制:set hive.mapred.mode=nonstrict;
- 設定Reduce-Task(增加運作速度):set mapred.reduce.tasks = 30;
- count(*)計算的時候包含了NULL值,而count(expr)則不包含空值,例如:
select count(*) from (select disctinct column from xxx) x
Select count(distinct column) from xxx
結果不一樣,因為count(distinct column)已經去null了
- 建表語句
use mid_trafficwisdom;
drop table if exists city_with_station;
create table if not exists city_with_station(
start_city_id bigint comment '起始城市ID',
start_city_name string comment '起始城市名',
end_city_id bigint comment '到達城市ID',
end_city_name string comment '到達城市名',
com_start_city_id bigint comment '公共起始城市ID',
com_start_city_name string comment '公共起始城市名',
com_end_city_id bigint comment '公共到達城市ID',
com_end_city_name string comment '公共到達城市名',
traffic_type string comment '交通類型',
section_station_set string comment '站到站集合',
create_time string comment '資料插入時間'
)comment '城市包含站-站集合'
location '/data/train/trafficwisdom/mid/city_with_station'
- 插入分區表
INSERT OVERWRITE TABLE mid_trafficwisdom.search_transfer_log PARTITION(push_date ='2019-01-29' )
SELECT from_city_name,to_city_name,start_date,traffic_type,from_station_name,to_station_name,from_station_code,to_station_code,send_time
from tmp_trafficwisdom.search_transfer_log_tmp where from_city_name is not null and to_city_name is not null
- 删除分區表:
ALTER TABLE mid_trafficwisdom.order_train_log DROP PARTITION (log_date = '2019-01-29');
//批量删除
ALTER TABLE ota_interface_check_info_log DROP PARTITION(log_date <='2017-08-10')
修改表名
ALTER TABLE name RENAME TO new_name
- 分區表批量插入分區表
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE mid_trafficwisdom.ml_cross_data PARTITION (log_date)
SELECT D.userid, D.city, D.category, D.from_place, D.to_place
, D.isclick, D.future_day, D.hour_time, D.banner_min_price, D.banner_min_time
, D.start_city_id, D.start_city_name, D.end_city_id, D.end_city_name, D.push_date
FROM tmp_trafficwisdom.ml_cross_data D
- 修改表字段
ALTER TABLE app_trafficwisdom.route_traffic_transfer_without_date CHANGE transfer_overtime transfer_overtime bigint comment '第一段途徑時間'
- ROW_NUMBER使用
SELECT transport_code, start_station_code, start_station_name, end_station_code, end_station_name
, run_time, price, create_date
FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY transport_code, start_station_code, end_station_code ORDER BY create_date DESC) AS rn
FROM mid_trafficwisdom.section_ticket_price
WHERE traffic_type = 'F'
) A WHERE A.rn = 1
- 一行拆多行(explode)
SELECT *
FROM (
SELECT from_city_name, to_city_name, COUNT(*) AS search_num
, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
FROM mid_trafficwisdom.search_line_log
WHERE push_date = '2019-01-29'
AND from_city_name IS NOT NULL
AND to_city_name IS NOT NULL
GROUP BY from_city_name, to_city_name
) d
LATERAL VIEW explode(traffic_types) adTable AS traffic_type limit 100;
多列合并一列
SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
, end_station_name, seat_infos.seat_name, seat_infos.seat_price
FROM (
SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
, end_station_name
, array(
named_struct('seat_name', '硬座', 'seat_price', hard_seat_price),
named_struct('seat_name', '無座', 'seat_price', no_seat_price),
named_struct('seat_name', '軟卧上', 'seat_price', soft_sleeper_up_price),
named_struct('seat_name', '軟卧下', 'seat_price', soft_sleeper_down_price),
named_struct('seat_name', '硬卧上', 'seat_price', hard_sleeper_up_price),
named_struct('seat_name', '硬卧中', 'seat_price', hard_sleeper_mid_price),
named_struct('seat_name', '硬卧下', 'seat_price', hard_sleeper_down_price),
named_struct('seat_name', '二等座', 'seat_price', second_seat_price),
named_struct('seat_name', '一等座', 'seat_price', first_seat_price),
named_struct('seat_name', '商務座', 'seat_price', business_seat_price)) AS seat_info
FROM mid_trafficwisdom.ticket_train_log
WHERE duration_date >= '{today,yyyy-MM-dd}'
) M
LATERAL VIEW explode(seat_info) seat_info AS seat_infos
WHERE seat_infos.seat_price IS NOT NULL AND seat_infos.seat_price > 0;
- 資料傾斜設定
set hive.groupby.mapaggr.checkinterval=100000 ;--這個是group的鍵對應的記錄條數超過這個值則會進行分拆,值根據具體資料量設定
set hive.groupby.skewindata=true; --如果是group by過程出現傾斜 應該設定為true
set hive.skewjoin.key=100000; --這個是join的鍵對應的記錄條數超過這個值則會進行分拆,值根據具體資料量設定
set hive.optimize.skewjoin=true;--如果是join 過程出現傾斜 應該設定為true
-
split拆分
例如某個字段名value為:
抽取最後一個字段"1小時50分鐘:o498X0bh9ySoH2h3ijQWMDzYSrjk_852_010002_2_319527329_南通_商丘_2019-01-29_1_367347825_780_1小時50分鐘"
split(value,'_')[size(split(value, "_" )) - 1 ] as min_time,
-
regexp_extract抽取字段
如果想抽取1小時50分鐘中的"1"和"50"轉換成秒
原資料min_time可能為:1小時50分鐘,2時50分鐘,2時,35分鐘。如下:
if(regexp_extract(min_time,'^([0-9]*?)小?時.*$',1)>0,regexp_extract(min_time,'^([0-9]*?)小?時.*$',1),0)*3600+if(regexp_extract(min_time,'([0-9]*?)分鐘.*$',1)>0 ,regexp_extract(min_time,'([0-9]*?)分鐘.*$',1),0)*60 as min_time
- explode用法
SELECT from_city_name, to_city_name, search_num, traffic_type
FROM (
SELECT from_city_name, to_city_name, COUNT(*) AS search_num
, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
FROM mid_trafficwisdom.search_direct_log
WHERE log_date = '2019-02-26'
AND from_city_name IS NOT NULL
AND to_city_name IS NOT NULL
GROUP BY from_city_name, to_city_name
) d
LATERAL VIEW explode(traffic_types) adTable AS traffic_type
- 時間轉換
from_unixtime(cast (time as int) , 'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(cast(substring(time,1,10) as int),'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(unix_timestamp(date, 'yyyyMMdd'), 'yyyy-MM-dd') as create_date
to_date(string timestamp) //日期時間轉日期函數
select date_sub('2012-12-08',10) from dual; //2012-11-28
- 字元串拼接
CONCAT_WS('+',M.wtboscheduleno,M.wtbobusno) AS bus_code
-
json解析字段
wtodatasourcenew字段内容為:
{
"abTest": [{
"expNo": "20180716_JLH201807",
"expGroup": "Z"
}, {
"expNo": "20181210_BI",
"expGroup": "Z"
}, {
"expNo": "20190114_guolv",
"expGroup": "Z"
}, {
"expNo": "20181022_PaiXu",
"expGroup": "Z"
}],
"requestId": "c6e8bf08a64ffee900988b3124ee4442"
}
SELECT *,GET_JSON_OBJECT(wtodatasourcenew,'$.requestId')
FROM base_tcdctrafficwisdomtravelorder.wisdomtravelorder where wtocreatedate BETWEEN '2019-03-17 00:00:00' AND '2019-03-17 23:59:59'
LIMIT 100
- 取小數點
CASE
WHEN round(b.order_num * 1.0 / a.search_num, 6) IS NULL THEN 0
ELSE round(b.order_num * 1.0 / a.search_num, 6)
END AS allpercent