Hive复合数据类型

通过以下三种复合类型，Hive 能够高效处理半结构化数据（如日志、JSON），避免频繁进行表连接操作

array 数组

array<value数据类型>
存储同类型元素的有序集合，元素通过索引访问（从 0 开始）

--查询复合数据
select  a_score[0] from student2

--构造复合数据-array
select array(值,值) from student

struct 集合

struct<key值:value数据类型,key值:value数据类型>
存储不同类型字段的集合，每个字段有名称和类型，通过点号（.）访问

--查询复合数据
select  s_score.chinese from student2

--构造复合数据
select named_struct(key,value,key,value) from student

map 字典

map<key数据类型,value数据类型>
存储键值对（Key-Value）集合，键必须唯一，通过键访问值

--查询复合数据
select  m_score['语文'] from student2

--构造复合数据
select map(key,value,key,value) from student

Hive内置函数

# 查看系统自带的函数
show functions;

# 显示自带的函数的用法
desc function upper;

# 详细显示自带的函数的用法
desc function extended upper;

user_info.txt数据清洗脚本

pro_user_info.py

import json  
import re  
from datetime import datetime  
  
def clean_data(input_file, output_file):  
    with open(input_file, 'r', encoding='utf-8') as infile, \  
            open(output_file, 'w', encoding='utf-8') as outfile:  
  
        for line in infile:  
            # 跳过空行  
            if not line.strip():  
                continue  
  
            # 分割字段  
            parts = line.strip().split('\t')  
            if len(parts) < 8:  
                continue  
  
            # 提取基础字段  
            user_id = parts[0]  
            name = parts[1]  
            sex = parts[2]  
            age = parts[3]  
            address = parts[4]  
            udate = parts[5]  
            lev = parts[6]  
            json_str = parts[7]  
  
            # 清洗日期字段 (保留日期部分)  
            try:  
                udate = datetime.strptime(udate, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')  
            except ValueError:  
                udate = "1970-01-01"  # 无效日期默认值  
  
            # 解析JSON字段            try:  
                info_data = json.loads(json_str)  
            except json.JSONDecodeError:  
                # 处理无效JSON格式  
                info_data = {  
                    "systemtype": "unknown",  
                    "education": "unknown",  
                    "marriage_status": "0",  
                    "phonebrand": "unknown"  
                }  
  
            # 构建复合数据类型  
            # 1. struct格式: 字段值逗号分隔            s_info = f"{info_data['systemtype']},{info_data['education']}," \  
                     f"{info_data['marriage_status']},{info_data['phonebrand']}"  
  
            # 2. map格式: key:value 逗号分隔  
            m_info = ','.join([f"{k}:{v}" for k, v in info_data.items()])  
  
            # 3. array格式: 值按固定顺序逗号分隔  
            a_info = f"{info_data['systemtype']}," \  
                     f"{info_data['education']}," \  
                     f"{info_data['marriage_status']}," \  
                     f"{info_data['phonebrand']}"  
  
            # 构建清洗后的行  
            cleaned_line = '\t'.join([  
                user_id, name, sex, age, address, udate, lev,  
                s_info, m_info, a_info  
            ])  
  
            outfile.write(cleaned_line + '\n')  
  
if __name__ == "__main__":  
    input_file = "user_info.txt"  
    output_file = "user_info_clean.txt"  
    clean_data(input_file, output_file)  
    print(f"数据清洗完成! 输出文件: {output_file}")

上传数据

1
2
3

hadoop dfs -mkdir /data/user  # 创建文件夹
hdfs dfs -ls /data/user  # 查看文件夹
hadoop dfs -put -f user_info_clean.txt /data/user #上传数据文件 -f 覆盖上传

建表

create external table user_info(
    id int,
    name string,
    sex string,
    age int,
    address string,
    udate date,
    lev int,
    s_info struct<systemtype:string, education:string, marriage_status:string, phonebrand:string>,
    m_info map<string,string>,
    a_info array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
location '/data/user';

练习

--3.1 按月统计各个地区男女生人数
select
    address `地区`,
    date_format(udate,'yyyy-mm') `月份`,
    count(case when sex='male' then id end) `男性人数`,
    count(case when sex='female' then id end) `女性人数`
from user_info
group by address, date_format(udate, 'yyyy-mm')
order by `地区`,`月份`;

--3.2 统计各地区的不同手机型号使用人数,并按照老中青(35以下青年 男65岁以下女55岁以下中年 男65岁以上女55岁以上老年)年龄划分
select
    address `地区`,
    map("青", 
        count(if(s_info.systemtype = 'android' and 
                ((sex = 'male' and age < 35) or (sex = 'female' and age < 35)), 1, null)),
        "中", 
        count(if(s_info.systemtype = 'android' and 
                ((sex = 'male' and age between 35 and 65) or (sex = 'female' and age between 35 and 55)), 1, null)),
        "老", 
        count(if(s_info.systemtype = 'android' and 
                ((sex = 'male' and age > 65) or (sex = 'female' and age > 55)), 1, null))
    ) `安卓使用人数`,
    
    map(
        "青", 
        count(if(s_info.systemtype = 'ios' and 
                ((sex = 'male' and age < 35) or (sex = 'female' and age < 35)), 1, null)),
        "中", 
        count(if(s_info.systemtype = 'ios' and 
                ((sex = 'male' and age between 35 and 65) or (sex = 'female' and age between 35 and 55)), 1, null)),
        "老", 
        count(if(s_info.systemtype = 'ios' and 
                ((sex = 'male' and age > 65) or (sex = 'female' and age > 55)), 1, null))
    ) `ios使用人数`
from user_info
group by address;

--3.3 统计不同地区,不同学历,使用的手机品牌(去重)
select
    address `地区`,
    s_info.education `学历`,
    collect_list(s_info.phonebrand) `手机品牌列表`
from user_info
group by address, s_info.education;

--3.4 统计不同等级,各个手机品牌的使用人数(需要行转列)
with brand_counts as (
  select
    lev,
    s_info.phonebrand as phonebrand,
    count(*) as cnt
  from user_info
  where s_info.phonebrand is not null
  group by lev, s_info.phonebrand
)
select
  lev,
  str_to_map(concat_ws(',', collect_list(concat(phonebrand, ':', cnt)))) as phonebrand_list
from brand_counts
group by lev;