hadoop上传数据

1
2
3
4
5
hadoop dfs -mkdir /emp
hadoop dfs -put emp0901.txt /emp/

hadoop dfs -mkdir /student
hadoop dfs -put student2.csv /student/

处理student2.txt表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import json

def transform_student_data(input_file, output_file):
"""
将JSON格式的学生数据转换为CSV格式

参数:
input_file: 输入JSON数据文件路径
output_file: 输出CSV文件路径
"""
try:
with open(input_file, 'r', encoding='utf-8') as infile, \
open(output_file, 'w', encoding='utf-8') as outfile:

# 写入CSV表头
# outfile.write("name,id,class_name,age,chinese,math,english,sex\n")

for line in infile:
# 解析JSON数据
student = json.loads(line.strip())

# 提取并处理分数数据
scores = student["score"]
score_dict = {}
for subject_score in scores.split('|'):
subject, score = subject_score.split(':')
score_dict[subject] = score

# 构造CSV行
csv_line = f"{student['name']},{student['id']},{student['class_name']}," \
f"{student['age']},{score_dict.get('语文', '')}," \
f"{score_dict.get('数学', '')},{score_dict.get('英语', '')}," \
f"{student['sex']}\n"

# 写入CSV文件
outfile.write(csv_line)

print(f"转换成功! 结果已保存到: {output_file}")

except Exception as e:
print(f"处理过程中出错: {str(e)}")


# 使用示例
if __name__ == "__main__":
input_file = "student2.txt" # 包含JSON数据的输入文件
output_file = "student2.csv" # 输出CSV文件

transform_student_data(input_file, output_file)

建表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
--创建emp表
create external table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal float,
comm float,
deptno int
)
row format delimited
fields terminated by ';'
lines terminated by '\n'
location '/emp'

--创建student表
create external table student(
name string,
id string,
class_name string,
age int,
chinese int,
math int,
english float,
sex string
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/student'

载入数据

1
2
3
4
5
6
7
8
9
--加local是取linux系统的路径,不加是取hdfs文件系统的路径
load data local inpath '/root/student.csv' into table student

--一般不用values
insert into student select * from student

--导出数据
insert overwrite local directory '/root/emp'
row format delimited fields terminated by '|' --分隔符

Linux环境安装python

1
2
3
4
5
6
7
8
9
# 解压
tar -zxvf Python-3.9.10.tgz -C /opt/module/
# 支撑包
yum install openssl-devel libffi-devel bzip2-devel gcc gcc-c++ wget -y
# 配置
cd /opt/module/Python-3.9.10/
./configure --enable-optimizations
# 安装
make altinstall

配置环境变量

1
2
3
4
5
6
7
8
9
10
11
12
13
# 配置环境变量
vim /etc/profile.d/my_env.sh

export PATH=$PATH:/opt/python39/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin

#生效 环境变量
source /etc/profile

#或者进行软连接 配置
cp libpython3.9.a /usr/lib64/

ln -s /opt/module/Python-3.9.10/python /usr/bin/python3
ln -s /usr/local/bin/pip3.9 /usr/bin/pip3

安装pyhive库

1
2
3
4
pip3 install  -i https://mirrors.aliyun.com/pypi/simple/  thrift
pip3 install -i https://mirrors.aliyun.com/pypi/simple/ thrift-sasl
pip3.9 install -i https://mirrors.aliyun.com/pypi/simple/ PyHive
pip3.9 install -i https://mirrors.aliyun.com/pypi/simple/ PyM

使用测试

1
2
3
4
5
6
7
from pyhive import hive

conn=hive.connect(host='localhost',port=10000,username='root',database='db_hive')
cursor=conn.cursor()
sql='show tables'
cursor.execute(sql)
print(cursor.fetchall())

练习

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
--4.student2指标计算  
--4.1 统计各班 语文平均分,数学平均分,英语平均分 语文最高分,英语最高分,数学最高分
select
class_name,
round(avg(chinese), 2) avg_chinese,
round(avg(math), 2) avg_math,
round(avg(english), 2) avg_english,
max(chinese) max_chinese,
max(math) max_math,
max(english) max_english
from student
group by class_name

--4.2 统计各班男生语文最高分,男生数学最高分,男生英语最高分, 女生语文最高分, 女生数学最高分, 女生英语最高分
select
class_name,
max(case when sex = '男' then chinese else null end) mc,
max(case when sex = '男' then math else null end) mm,
max(case when sex = '男' then english else null end) me,
max(case when sex = '女' then chinese else null end) fmc,
max(case when sex = '女' then math else null end) fmm,
max(case when sex = '女' then english else null end) fme
from student
group by class_name

--4.3 统计每人平均分,总分,三科中的最高分,最高分科目名称,在本班的总分排名(有并列),判断是否在平均分线以上
select
name,
id,
class_name,
sex,
round((chinese+math+english)/3.0, 2) avg_score,
(chinese+math+english) total_score,
greatest(chinese, math, english) highest_score,
case greatest(chinese, math, english)
when chinese then '语文'
when math then '数学'
else '英语'
end highest_subject,
dense_rank() over (partition by class_name order by (chinese+math+english) desc) class_rank,
if((chinese+math+english) >= avg(chinese+math+english) over (partition by class_name), '是', '否') above_avg
from student
order by class_name, class_rank;

--4.4 统计每个学生总分并降序排序 ,显示本人总分和上一个学生分数的差值
select
name, id, class_name,
(chinese + math + english) total_score,
coalesce(lag(chinese + math + english) over (order by chinese + math + english desc) - (chinese + math + english), 0) cha_score
from student
order by total_score desc;