hadoop上传数据

hadoop dfs -mkdir /emp
hadoop dfs -put emp0901.txt /emp/

hadoop dfs -mkdir /student
hadoop dfs -put student2.csv /student/

处理student2.txt表

import json

def transform_student_data(input_file, output_file):
    """
    将JSON格式的学生数据转换为CSV格式

    参数:
    input_file: 输入JSON数据文件路径
    output_file: 输出CSV文件路径
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
                open(output_file, 'w', encoding='utf-8') as outfile:

            # 写入CSV表头
            # outfile.write("name,id,class_name,age,chinese,math,english,sex\n")

            for line in infile:
                # 解析JSON数据
                student = json.loads(line.strip())

                # 提取并处理分数数据
                scores = student["score"]
                score_dict = {}
                for subject_score in scores.split('|'):
                    subject, score = subject_score.split(':')
                    score_dict[subject] = score

                # 构造CSV行
                csv_line = f"{student['name']},{student['id']},{student['class_name']}," \
                           f"{student['age']},{score_dict.get('语文', '')}," \
                           f"{score_dict.get('数学', '')},{score_dict.get('英语', '')}," \
                           f"{student['sex']}\n"

                # 写入CSV文件
                outfile.write(csv_line)

        print(f"转换成功! 结果已保存到: {output_file}")

    except Exception as e:
        print(f"处理过程中出错: {str(e)}")


# 使用示例
if __name__ == "__main__":
    input_file = "student2.txt"  # 包含JSON数据的输入文件
    output_file = "student2.csv"  # 输出CSV文件

    transform_student_data(input_file, output_file)

建表

--创建emp表
create external table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal float,
comm float,
deptno int
)
row format delimited
fields terminated by ';'
lines terminated by '\n'
location '/emp'

--创建student表
create external table student(
name string,
id string,
class_name string,
age int,
chinese int,
math int,
english float,
sex string
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/student'

载入数据

--加local是取linux系统的路径，不加是取hdfs文件系统的路径
load data local inpath '/root/student.csv' into table student

--一般不用values
insert into student select * from student

--导出数据
insert overwrite local directory '/root/emp'
row format delimited fields terminated by '|' --分隔符

Linux环境安装python

# 解压
tar -zxvf Python-3.9.10.tgz -C /opt/module/
# 支撑包
yum install openssl-devel libffi-devel bzip2-devel gcc gcc-c++ wget -y
# 配置
cd /opt/module/Python-3.9.10/
./configure --enable-optimizations
# 安装
make altinstall

配置环境变量

# 配置环境变量
vim  /etc/profile.d/my_env.sh

export PATH=$PATH:/opt/python39/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin

#生效 环境变量
source /etc/profile

#或者进行软连接 配置 
cp libpython3.9.a  /usr/lib64/

ln -s /opt/module/Python-3.9.10/python /usr/bin/python3
ln -s /usr/local/bin/pip3.9 /usr/bin/pip3

安装pyhive库

pip3 install  -i https://mirrors.aliyun.com/pypi/simple/  thrift
pip3 install  -i https://mirrors.aliyun.com/pypi/simple/  thrift-sasl
pip3.9 install   -i https://mirrors.aliyun.com/pypi/simple/ PyHive
pip3.9 install   -i https://mirrors.aliyun.com/pypi/simple/ PyM

使用测试

from pyhive import hive

conn=hive.connect(host='localhost',port=10000,username='root',database='db_hive')
cursor=conn.cursor()
sql='show tables'
cursor.execute(sql)
print(cursor.fetchall())

练习

--4.student2指标计算  
--4.1 统计各班 语文平均分,数学平均分,英语平均分 语文最高分,英语最高分,数学最高分  
select 
    class_name,
    round(avg(chinese), 2) avg_chinese,
    round(avg(math), 2) avg_math,
    round(avg(english), 2) avg_english,
    max(chinese) max_chinese,
    max(math) max_math,
    max(english) max_english
from student
group by class_name

--4.2 统计各班男生语文最高分,男生数学最高分,男生英语最高分, 女生语文最高分, 女生数学最高分, 女生英语最高分  
select 
    class_name,
    max(case when sex = '男' then chinese else null end) mc,
    max(case when sex = '男' then math else null end) mm,
    max(case when sex = '男' then english else null end) me,
    max(case when sex = '女' then chinese else null end) fmc,
    max(case when sex = '女' then math else null end) fmm,
    max(case when sex = '女' then english else null end) fme
from student
group by class_name

--4.3 统计每人平均分,总分,三科中的最高分,最高分科目名称,在本班的总分排名(有并列),判断是否在平均分线以上  
select 
    name,
    id,
    class_name,
    sex,
    round((chinese+math+english)/3.0, 2) avg_score,
    (chinese+math+english) total_score,
    greatest(chinese, math, english) highest_score,
    case greatest(chinese, math, english)
        when chinese then '语文'
        when math then '数学'
        else '英语'
    end highest_subject,
    dense_rank() over (partition by class_name order by (chinese+math+english) desc) class_rank,
    if((chinese+math+english) >= avg(chinese+math+english) over (partition by class_name), '是', '否') above_avg
from student
order by class_name, class_rank;

--4.4 统计每个学生总分并降序排序 ,显示本人总分和上一个学生分数的差值
select 
    name, id, class_name,
    (chinese + math + english) total_score,
    coalesce(lag(chinese + math + english) over (order by chinese + math + english desc) - (chinese + math + english), 0) cha_score
from student
order by total_score desc;