天天看點

hive之Python UDF函數操作map資料 詳解 全是幹貨

#1、Hive基本操作: 檢視dw.full_h_usr_base_user的詳細資訊,可以擷取資料檔案的存放路徑 desc formatted dw.full_h_usr_base_user; dfs -ls hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user; 删除外表full_h_usr_base_user的資料(遞迴删除目錄) dfs -rm -r hdfs://BIGDATA:9000/user/hive/warehouse/dw.db/full_h_usr_base_user;

#192.168.1.181 192.168.1.1 #2、建立帶有map資料類型的外表 create external table dw.full_h_usr_base_user( user_id  string    comment '使用者id', reg_ip  string    comment 'ip', reg_ip_geo_map map<string,string> comment --map資料類型建立方法 'city_id,city_name,isp,province_id,province_name,country_id,country_name,postzip,district,province' ) comment '使用者測試表' partitioned by(ds string comment '目前時間,用于分區字段') row format delimited fields terminated by '\t' collection items terminated by ","--map鍵值對逗号分割 map keys terminated by ":"--map鍵值冒号分割 stored as TEXTFILE;--存儲為文本類型

#3、加載資料(指定user_id和reg_ip即可,reg_ip_geo_map可以通過UDF運算出來) load data local inpath '/opt/data/dw.full_h_usr_base_user.del'  overwrite into table dw.full_h_usr_base_user partition(ds='2017-09-25');

#4、自定義函數:Python UDF函數ip_to_num.py

#coding=utf-8      
#Version:python3.5.2      
#Tools:Pycharm      
#Date:      
__author__ = "Colby"
      
import socket      
import struct      
import sys,datetime      
ipDB='/opt/data/IP_utf-8.csv'
      
for line in sys.stdin:      
line = line.strip()      
user_id, reg_ip, reg_ip_geo_map, ds = line.split('\t')      
num_ip = int(socket.ntohl(struct.unpack("I", socket.inet_aton(str(reg_ip)))[0]))      
f = open(ipDB, 'r', encoding="utf-8")      
ipDict = {}      
count = 0
      
for line in f:      
if count == 9:      
count += 1
      
continue
      
line = line.split(',')      
if int(line[2]) <= num_ip and int(line[3]) >= num_ip:      
ipDict['IP'] = reg_ip      
ipDict['nationality'] = line[4]      
ipDict['province'] = line[5]      
ipDict['city'] = line[6]      
ipDict['Corporation'] = line[8]      
reg_ip_geo_map=str(ipDict)[1:-1].replace('\'','').replace(' ','')      
print('\t'.join([user_id, reg_ip, reg_ip_geo_map,ds]))      
f.close()      

#5、将udf函數檔案上傳檔案到伺服器指定目錄 /opt/udf/ /opt/udf/ip_to_num.py

#6、進入hive指令行,add檔案 add file /opt/udf/ip_to_num.py; #Added resources: [/opt/udf/ip_to_num.py]

#7、使用udf函數并進行測試 SELECT   TRANSFORM (user_id, reg_ip, reg_ip_geo_map, ds)   USING 'python3 ip_to_num.py'   AS (user_id, reg_ip, reg_ip_geo_map, ds) FROM dw.full_h_usr_base_user;

#8、函數處理資料,并且overwrite表dw.full_h_usr_base_user,注意動态分區參數 set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table dw.full_h_usr_base_user partition(ds) select user_id ,reg_ip ,str_to_map(reg_ip_geo_map,',',':') as reg_ip_geo_map ,ds from ( SELECT   TRANSFORM (user_id, reg_ip, reg_ip_geo_map,ds)   USING 'python3 ip_to_num.py'   AS (user_id, reg_ip, reg_ip_geo_map,ds) FROM dw.full_h_usr_base_user ) as a ;

#9、查詢處理好的資料,學會查詢map類型的資料

-- Fetch map-typed geo attributes (bracket lookup by key) for a single
-- user in a single partition.
SELECT
    user_id,
    reg_ip_geo_map['province'],
    reg_ip_geo_map['city'],
    reg_ip_geo_map['nationality']
FROM dw.full_h_usr_base_user
WHERE ds = '2017-09-25'
  AND user_id = '1000000015';

輸出結果: OK 1000000015      安徽省  合肥市  中國 Time taken: 0.107 seconds, Fetched: 1 row(s) #動态分區,将字元串轉換成MAP #set hive.exec.dynamic.partition.mode=nonstrict; #insert into dw.full_h_usr_base_user partition(ds)  #select user_id #, reg_ip #, str_to_map(reg_ip_geo_map, ',', ':') reg_ip_geo_map #,ds from dw.full_h_usr_base_user_tmp;