天天看點

利用hive完成阿裡天池大資料音樂預測比賽資料處理工作

hive shell

建立外表,指定目錄

CREATE EXTERNAL TABLE IF NOT EXISTS songs2 (

sid string,

aid string,

ptime string,

sinit int,

language int,

gender int) 

COMMENT 'This is the staging page view table'   

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

LOCATION  '/bs/music/songs/'; 

導入HDFS檔案(原檔案消失)

LOAD DATA INPATH '/bs/music/input/mars_tianchi_songs.csv' OVERWRITE INTO TABLE songs2;

檢視前10條資料

select * from songs2 limit 10;

建立外表,指定目錄

CREATE EXTERNAL TABLE IF NOT EXISTS useraction (

uid string,

sid string,

btime string,

atype int,

ds string) 

COMMENT 'This is the staging page view table'   

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

LOCATION  '/bs/music/useraction/'; 

導入HDFS檔案(原檔案消失)

LOAD DATA INPATH '/bs/music/input/mars_tianchi_user_actions.csv' OVERWRITE INTO TABLE useraction;

select * from useraction limit 10;

表連接配接 小表在前

SELECT t1.*,t2.* FROM songs2 t1 JOIN useraction t2 on t1.sid=t2.sid;

Map join連接配接

SELECT t1.*,t2.* FROM songs2 t1 JOIN useraction t2 on t1.sid=t2.sid;

導出查詢資料到hdfs 

INSERT OVERWRITE DIRECTORY '/bs/music/data' 

SELECT t1.*,t2.* FROM songs2 t1 JOIN useraction t2 on t1.sid=t2.sid;

查詢結果儲存到表

CREATE EXTERNAL TABLE IF NOT EXISTS usersongs (

sid string,

aid string,

ptime string,

sinit int,

language int,

gender int,

uid string,

sid2 string,

btime string,

atype int,

ds string) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','

LOCATION  '/bs/music/data/'; 

INSERT OVERWRITE TABLE usersongs 

SELECT t1.*,t2.* FROM songs2 t1 JOIN useraction t2 on t1.sid=t2.sid;

查詢結果儲存到本地

hive -e "select * from usersongs limit 10" >> /opt/tools/test.csv

檢視HDFS上檔案的前5行

hadoop fs -text  /bs/music/data/000000_0 |head -n 5

去重分組查詢 同一aid的uid去重總量,sid的去重總量

select count(distinct uid),count(distinct sid),aid from usersongs  where atype=1 group by aid;

繼續閱讀