Hive复合数据类型

通过以下三种复合类型,Hive 能够高效处理半结构化数据(如日志、JSON),避免频繁进行表连接操作

array 数组

array<value数据类型>
存储同类型元素的有序集合,元素通过索引访问(从 0 开始)

1
2
3
4
5
--查询复合数据
select  a_score[0] from student2

--构造复合数据-array
select array(值,值) from student

struct 集合

struct<key值:value数据类型,key值:value数据类型>
存储不同类型字段的集合,每个字段有名称和类型,通过点号(.)访问

1
2
3
4
5
--查询复合数据
select  s_score.chinese from student2

--构造复合数据
select named_struct(key,value,key,value) from student

map 字典

map<key数据类型,value数据类型>
存储键值对(Key-Value)集合,键必须唯一,通过键访问值

1
2
3
4
5
--查询复合数据
select  m_score['语文'] from student2

--构造复合数据
select map(key,value,key,value) from student

Hive内置函数

1
2
3
4
5
6
7
8
# 查看系统自带的函数
show functions;

# 显示自带的函数的用法
desc function upper;

# 详细显示自带的函数的用法
desc function extended upper;

user_info.txt数据清洗脚本

pro_user_info.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json  
import re
from datetime import datetime

def clean_data(input_file, output_file):
with open(input_file, 'r', encoding='utf-8') as infile, \
open(output_file, 'w', encoding='utf-8') as outfile:

for line in infile:
# 跳过空行
if not line.strip():
continue

# 分割字段
parts = line.strip().split('\t')
if len(parts) < 8:
continue

# 提取基础字段
user_id = parts[0]
name = parts[1]
sex = parts[2]
age = parts[3]
address = parts[4]
udate = parts[5]
lev = parts[6]
json_str = parts[7]

# 清洗日期字段 (保留日期部分)
try:
udate = datetime.strptime(udate, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
except ValueError:
udate = "1970-01-01" # 无效日期默认值

# 解析JSON字段 try:
info_data = json.loads(json_str)
except json.JSONDecodeError:
# 处理无效JSON格式
info_data = {
"systemtype": "unknown",
"education": "unknown",
"marriage_status": "0",
"phonebrand": "unknown"
}

# 构建复合数据类型
# 1. struct格式: 字段值逗号分隔 s_info = f"{info_data['systemtype']},{info_data['education']}," \
f"{info_data['marriage_status']},{info_data['phonebrand']}"

# 2. map格式: key:value 逗号分隔
m_info = ','.join([f"{k}:{v}" for k, v in info_data.items()])

# 3. array格式: 值按固定顺序逗号分隔
a_info = f"{info_data['systemtype']}," \
f"{info_data['education']}," \
f"{info_data['marriage_status']}," \
f"{info_data['phonebrand']}"

# 构建清洗后的行
cleaned_line = '\t'.join([
user_id, name, sex, age, address, udate, lev,
s_info, m_info, a_info
])

outfile.write(cleaned_line + '\n')

if __name__ == "__main__":
input_file = "user_info.txt"
output_file = "user_info_clean.txt"
clean_data(input_file, output_file)
print(f"数据清洗完成! 输出文件: {output_file}")

上传数据

1
2
3
hadoop dfs -mkdir /data/user  # 创建文件夹
hdfs dfs -ls /data/user # 查看文件夹
hadoop dfs -put -f user_info_clean.txt /data/user #上传数据文件 -f 覆盖上传

建表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
create external table user_info(
id int,
name string,
sex string,
age int,
address string,
udate date,
lev int,
s_info struct<systemtype:string, education:string, marriage_status:string, phonebrand:string>,
m_info map<string,string>,
a_info array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
location '/data/user';

练习

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
--3.1 按月统计各个地区男女生人数
select
address `地区`,
date_format(udate,'yyyy-mm') `月份`,
count(case when sex='male' then id end) `男性人数`,
count(case when sex='female' then id end) `女性人数`
from user_info
group by address, date_format(udate, 'yyyy-mm')
order by `地区`,`月份`;

--3.2 统计各地区的不同手机型号使用人数,并按照老中青(35以下青年 男65岁以下女55岁以下中年 男65岁以上女55岁以上老年)年龄划分
select
address `地区`,
map("青",
count(if(s_info.systemtype = 'android' and
((sex = 'male' and age < 35) or (sex = 'female' and age < 35)), 1, null)),
"中",
count(if(s_info.systemtype = 'android' and
((sex = 'male' and age between 35 and 65) or (sex = 'female' and age between 35 and 55)), 1, null)),
"老",
count(if(s_info.systemtype = 'android' and
((sex = 'male' and age > 65) or (sex = 'female' and age > 55)), 1, null))
) `安卓使用人数`,

map(
"青",
count(if(s_info.systemtype = 'ios' and
((sex = 'male' and age < 35) or (sex = 'female' and age < 35)), 1, null)),
"中",
count(if(s_info.systemtype = 'ios' and
((sex = 'male' and age between 35 and 65) or (sex = 'female' and age between 35 and 55)), 1, null)),
"老",
count(if(s_info.systemtype = 'ios' and
((sex = 'male' and age > 65) or (sex = 'female' and age > 55)), 1, null))
) `ios使用人数`
from user_info
group by address;

--3.3 统计不同地区,不同学历,使用的手机品牌(去重)
select
address `地区`,
s_info.education `学历`,
collect_list(s_info.phonebrand) `手机品牌列表`
from user_info
group by address, s_info.education;

--3.4 统计不同等级,各个手机品牌的使用人数(需要行转列)
with brand_counts as (
select
lev,
s_info.phonebrand as phonebrand,
count(*) as cnt
from user_info
where s_info.phonebrand is not null
group by lev, s_info.phonebrand
)
select
lev,
str_to_map(concat_ws(',', collect_list(concat(phonebrand, ':', cnt)))) as phonebrand_list
from brand_counts
group by lev;