Project requirements

Build an interface for submitting product information; once a product is submitted, check it against current laws and regulations to determine whether it is compliant (tentative).

Pharmaceutical and medical device websites

NMPA: Medical Device Regulatory Documents
CMDE: Center for Medical Device Evaluation, NMPA — Regulatory Documents
Yaozhi: Policy & Regulation Database (Yaozhi Data)

NMPA data crawling (National Medical Products Administration)

Home page: Medical Device Regulatory Documents
Goal: for every article under the link above, crawl the index ID, title, category, date, article content, and link, and import them into the database; if an article has attachments, download them to the corresponding path (\attachments\nmpa\<index ID>\<article title>\) and record the attachment path and attachment count in the database.

Table creation

CREATE TABLE `hwz_nmpa` (
  `id` INT NOT NULL AUTO_INCREMENT COMMENT 'Auto-increment primary key',
  `index_id` VARCHAR(50) NOT NULL COMMENT 'Index ID',
  `title` VARCHAR(200) NOT NULL COMMENT 'Title',
  `category` VARCHAR(50) COMMENT 'Category',
  `post_date` DATE COMMENT 'Date',
  `content` TEXT COMMENT 'Article content',
  `attachment_count` INT DEFAULT 0 COMMENT 'Number of attachments',
  `attachment_path` VARCHAR(255) COMMENT 'Attachment storage path (D:\WorkSpace\RPA_project\attachments\nmpa\<index_id>\<title>)',
  `link` VARCHAR(255) COMMENT 'Article link',
  `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'Record creation time',
  `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Record update time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='NMPA medical device regulatory documents';

Python code workflow

  1. Paginated list crawling

    • get_list() iterates over the list pages (index.html, index_1.html, …) and extracts the title and link of every entry on each page.
    • Key points: pagination logic (page 1 has no number; subsequent pages are index_{n-1}.html), with a 2-second pause between pages to avoid triggering anti-crawling measures.
  2. Detail-page parsing

    • get_page() parses a single article and extracts:
      • Core fields: index ID (index_id), title, category, date, and body content.
      • Base URL: the https://<domain>/ part is split off the link and used to resolve the attachments' relative paths.
  3. Attachment download

    • download_attachments() handles attachments:
      • Path convention: D:\...\nmpa\{index_id}\{safe_title}\ (special characters filtered out).

      • Duplicate protection: check whether the local file already exists and skip files that were already downloaded.

      • Chunked download: stream=True avoids loading large files into memory all at once.

  4. Data persistence

    • save_to_mysql() writes to the database:
      • Deduplication: a record is looked up by index_id + link; if it exists it is updated, otherwise a new row is inserted (a single-statement upsert alternative is sketched right after this list).
      • Transaction handling: roll back (rollback) on error to keep the data consistent.
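As noted in step 4, deduplication is done with a separate existence check before the UPDATE or INSERT. If a unique index existed on (index_id, link) — the DDL above does not define one, so this is an assumption — the same effect could be achieved with a single upsert statement. A minimal sketch with pymysql (upsert_record and the index name are hypothetical, not part of the script below):

import pymysql

# Hypothetical: assumes a unique key has been added, e.g.
#   ALTER TABLE hwz_nmpa ADD UNIQUE KEY uk_index_link (index_id, link);
UPSERT_SQL = """
    INSERT INTO hwz_nmpa (index_id, title, category, post_date,
                          content, attachment_count, attachment_path, link)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        title = VALUES(title),
        category = VALUES(category),
        post_date = VALUES(post_date),
        content = VALUES(content),
        attachment_count = VALUES(attachment_count),
        attachment_path = VALUES(attachment_path),
        update_time = CURRENT_TIMESTAMP
"""

def upsert_record(connection, row):
    # row is the 8-tuple of column values in the order listed above
    with connection.cursor() as cursor:
        cursor.execute(UPSERT_SQL, row)
    connection.commit()
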
import os
import xbot
import requests
import pymysql
from xbot import print, sleep, web
from . import package
from .package import variables as glv
from urllib.parse import urljoin
from datetime import datetime

# MySQL database configuration
DB_CONFIG = {
    'host': '192.168.200.129',
    'user': 'root',
    'password': 'root123456',
    'database': 'test',
    'charset': 'utf8mb4'
}

# Save a record to MySQL
def save_to_mysql(index_id, title, category, post_date, content, attachment_count, link):
    connection = None
    try:
        # Open the database connection
        connection = pymysql.connect(**DB_CONFIG)

        # Normalize the date format
        try:
            post_date = datetime.strptime(post_date, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            post_date = None

        # Attachment storage path (contains index_id and title)
        safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()
        attachment_path = rf"D:\WorkSpace\RPA_project\attachments\nmpa\{index_id}\{safe_title}"

        with connection.cursor() as cursor:
            # Check whether the record already exists (match on both index_id and link)
            check_sql = """
                SELECT 1 FROM hwz_nmpa
                WHERE index_id = %s AND link = %s
            """
            cursor.execute(check_sql, (index_id, link))
            exists = cursor.fetchone()

            if exists:
                # Update the existing record
                sql = """
                    UPDATE hwz_nmpa SET
                        title = %s,
                        category = %s,
                        post_date = %s,
                        content = %s,
                        attachment_count = %s,
                        attachment_path = %s,
                        update_time = CURRENT_TIMESTAMP
                    WHERE index_id = %s AND link = %s
                """
                cursor.execute(sql, (title, category, post_date, content,
                                     attachment_count, attachment_path,
                                     index_id, link))
                print(f"Updated record: {index_id} - {link}")
            else:
                # Insert a new record (including the link field)
                sql = """
                    INSERT INTO hwz_nmpa (
                        index_id, title, category, post_date,
                        content, attachment_count, attachment_path,
                        link
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (index_id, title, category, post_date,
                                     content, attachment_count, attachment_path,
                                     link))
                print(f"Inserted record: {index_id} - {link}")

        connection.commit()

    except pymysql.Error as e:
        print(f"Database operation failed: {str(e)}")
        if connection:
            connection.rollback()
    finally:
        if connection:
            connection.close()

# Download attachments
def download_attachments(web_object, index_id, title, base_url):
    # Build a filesystem-safe directory name from the title
    safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()

    # Attachment save path (kept consistent with the attachment_path stored in the database)
    base_dir = r"D:\WorkSpace\RPA_project\attachments\nmpa"
    save_dir = os.path.join(base_dir, index_id, safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Find all attachment links on the page
    attachment_links = web_object.find_all_by_xpath("//a[contains(@href, '.pdf') or contains(@href, '.doc') or contains(@href, '.xls') or contains(@href, '.zip') or contains(@href, '.rar')]")

    downloaded_files = []

    for link_element in attachment_links:
        file_url = link_element.get_attribute("href")
        file_name = link_element.get_text().strip()

        # Resolve relative URLs
        if not file_url.startswith("http"):
            file_url = urljoin(base_url, file_url)

        # Determine the file name
        if not file_name:
            file_name = os.path.basename(file_url)
        else:
            ext = os.path.splitext(file_url)[1]
            if not file_name.endswith(ext):
                file_name += ext

        # Skip files that have already been downloaded
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print(f"Attachment already exists, skipping: {file_name}")
            continue

        try:
            # Download the file
            print(f"Downloading attachment: {file_name}")
            response = requests.get(file_url, stream=True, timeout=30)
            response.raise_for_status()

            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            downloaded_files.append(file_name)
            print(f"Attachment downloaded: {file_name}")
        except Exception as e:
            print(f"Failed to download attachment: {file_name}, error: {str(e)}")

    # Return the files downloaded this run and the total file count in the directory
    total_files = len(os.listdir(save_dir)) if os.path.exists(save_dir) else 0
    return downloaded_files, total_files

# Fetch a detail page
def get_page(title, link):
    print(f"Fetching: {title}, {link}")
    web_object = web.create(link, 'edge', load_timeout=20)

    # Base URL used to resolve relative attachment paths
    base_url = "/".join(link.split("/")[:3]) + "/"

    try:
        # Index ID
        index_id = web_object.find_by_xpath('/html/body/div[4]/div[1]/table/tbody/tr[1]/td[2]').get_text()
        print(index_id)
        # Title
        title = web_object.find_by_xpath('/html/body/div[4]/div[1]/table/tbody/tr[2]/td[2]').get_text()
        print(title)
        # Category
        category = web_object.find_by_xpath('/html/body/div[4]/div[1]/table/tbody/tr[1]/td[4]').get_text()
        print(category)
        # Date
        post_date = web_object.find_by_xpath('/html/body/div[4]/div[1]/table/tbody/tr[3]/td[2]').get_text()
        print(post_date)
        # Article content
        content = web_object.find_by_xpath('/html/body/div[4]/div[5]/p[1]').get_text()
        print(content)

        # Download attachments
        downloaded_files, attachment_count = download_attachments(web_object, index_id, title, base_url)
        if downloaded_files:
            print(f"Downloaded {len(downloaded_files)} attachment(s); {attachment_count} attachment(s) in total")
        else:
            print(f"No new attachments to download; {attachment_count} attachment(s) in total")

        # Save to the database
        save_to_mysql(index_id, title, category, post_date, content, attachment_count, link)

    except Exception as e:
        print(f"Failed to fetch page content: {str(e)}")
    finally:
        web_object.close()


# Crawl the list pages
def get_list():
    base_url = 'https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index'
    max_page = 44  # crawl 44 pages (including the first page)

    for page in range(1, max_page + 1):
        # Build the page URL (page 1 is index.html; from page 2 on it is index_1.html, index_2.html, ...)
        url = f"{base_url}.html" if page == 1 else f"{base_url}_{page-1}.html"
        print(f"\n=== Crawling page {page}/{max_page} ===")

        # Open the page and extract the list items
        web_object = web.create(url, 'edge', load_timeout=20)
        elements = web_object.find_all_by_xpath("/html/body/div[5]/div/div[2]/ul/li/a")

        # Process every item on the current page
        for element in elements:
            title = element.get_text().strip()
            link = element.get_attribute("href").replace("../../", "https://www.nmpa.gov.cn/")
            # Crawl the detail page
            get_page(title, link)

        web_object.close()
        sleep(2)  # wait 2 seconds between pages to avoid hammering the site

def main(args):
    get_list()

Verification
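
A quick way to spot-check the imported data (a throwaway snippet reusing the DB_CONFIG settings from the script above; not part of the crawler itself):

import pymysql

connection = pymysql.connect(host='192.168.200.129', user='root',
                             password='root123456', database='test', charset='utf8mb4')
with connection.cursor() as cursor:
    # Total number of imported articles
    cursor.execute("SELECT COUNT(*) FROM hwz_nmpa")
    print("rows imported:", cursor.fetchone()[0])
    # The five most recently inserted records
    cursor.execute("SELECT index_id, title, attachment_count FROM hwz_nmpa ORDER BY id DESC LIMIT 5")
    for row in cursor.fetchall():
        print(row)
connection.close()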

CMDE data crawling (Center for Medical Device Evaluation)

Home page: Center for Medical Device Evaluation, NMPA — Regulatory Documents
Goal: for every article under the link above, crawl the title, article source, publish date, article content, and link, and import them into the database; if an article has attachments, download them to the corresponding path (\attachments\cmde\<article title>\) and record the attachment path and attachment count in the database.
(To save time, only 5 pages are crawled here as a verification test.)

Table creation

CREATE TABLE `hwz_cmde` (
  `id` INT NOT NULL AUTO_INCREMENT COMMENT 'Auto-increment primary key',
  `title` VARCHAR(200) NOT NULL COMMENT 'Title',
  `source` VARCHAR(50) COMMENT 'Article source',
  `post_date` DATE COMMENT 'Publish date',
  `content` TEXT COMMENT 'Article content',
  `attachment_count` INT DEFAULT 0 COMMENT 'Number of attachments',
  `attachment_path` VARCHAR(255) COMMENT 'Attachment storage path (D:\WorkSpace\RPA_project\attachments\cmde\<title>\)',
  `link` VARCHAR(255) COMMENT 'Article link',
  `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'Record creation time',
  `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Record update time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='CMDE regulatory documents';

Python code workflow

The workflow is the same as for NMPA; only the page parsing and the database persistence need minor adjustments.
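
Because only the detail-page parsing and the target table change, the paginated list walk itself could in principle be shared between the two scripts. A rough sketch of that idea, reusing the xbot web and sleep helpers already imported above (crawl_paginated_site and the handle_item callbacks are hypothetical names, not part of the actual code):

from xbot import print, sleep, web

def crawl_paginated_site(base_url, max_page, list_xpath, handle_item):
    # Shared list walker: page 1 is index.html, page n is index_{n-1}.html
    for page in range(1, max_page + 1):
        url = f"{base_url}.html" if page == 1 else f"{base_url}_{page-1}.html"
        print(f"Crawling page {page}/{max_page}")
        web_object = web.create(url, 'edge', load_timeout=20)
        for element in web_object.find_all_by_xpath(list_xpath):
            # Delegate per-item work (resolving the link, parsing, saving) to the caller
            handle_item(element.get_text().strip(), element.get_attribute("href"))
        web_object.close()
        sleep(2)

# Hypothetical usage: the NMPA and CMDE crawlers would then differ only in their arguments.
# crawl_paginated_site('https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index', 44,
#                      "/html/body/div[5]/div/div[2]/ul/li/a", nmpa_handle_item)
# crawl_paginated_site('https://www.cmde.org.cn/flfg/index', 13,
#                      "/html/body/div[3]/div[3]/div[2]/ul/li/a", cmde_handle_item)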

import os
import re
import xbot
import requests
import pymysql
from xbot import print, sleep, web
from . import package
from .package import variables as glv
from urllib.parse import urljoin
from datetime import datetime

# MySQL database configuration
DB_CONFIG = {
    'host': '192.168.200.129',
    'user': 'root',
    'password': 'root123456',
    'database': 'test',
    'charset': 'utf8mb4'
}

# Save a record to MySQL
def save_to_mysql(title, source, post_date, content, attachment_count, link):
    connection = None
    try:
        # Open the database connection
        connection = pymysql.connect(**DB_CONFIG)

        # Normalize the date format
        try:
            post_date = datetime.strptime(post_date, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            post_date = None

        # Attachment storage path (based on the title)
        safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()
        attachment_path = rf"D:\WorkSpace\RPA_project\attachments\cmde\{safe_title}"

        with connection.cursor() as cursor:
            # Check whether the record already exists (match on link)
            check_sql = """
                SELECT 1 FROM hwz_cmde
                WHERE link = %s
            """
            cursor.execute(check_sql, (link,))
            exists = cursor.fetchone()

            if exists:
                # Update the existing record
                sql = """
                    UPDATE hwz_cmde SET
                        title = %s,
                        source = %s,
                        post_date = %s,
                        content = %s,
                        attachment_count = %s,
                        attachment_path = %s,
                        update_time = CURRENT_TIMESTAMP
                    WHERE link = %s
                """
                cursor.execute(sql, (title, source, post_date, content,
                                     attachment_count, attachment_path, link))
                print(f"Updated record: {title} - {link}")
            else:
                # Insert a new record (including the link field)
                sql = """
                    INSERT INTO hwz_cmde (
                        title, source, post_date,
                        content, attachment_count, attachment_path,
                        link
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (title, source, post_date,
                                     content, attachment_count, attachment_path,
                                     link))
                print(f"Inserted record: {title} - {link}")

        connection.commit()

    except pymysql.Error as e:
        print(f"Database operation failed: {str(e)}")
        if connection:
            connection.rollback()
    finally:
        if connection:
            connection.close()

# Download attachments
def download_attachments(web_object, title, base_url):
    # Build a filesystem-safe directory name from the title
    safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()

    # Attachment save path
    base_dir = r"D:\WorkSpace\RPA_project\attachments\cmde"
    save_dir = os.path.join(base_dir, safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Find all attachment links on the page
    attachment_links = web_object.find_all_by_xpath("//a[contains(@href, '.pdf') or contains(@href, '.doc') or contains(@href, '.xls') or contains(@href, '.zip') or contains(@href, '.rar')]")

    downloaded_files = []

    for link_element in attachment_links:
        file_url = link_element.get_attribute("href")
        file_name = link_element.get_text().strip()

        # Resolve relative URLs
        if not file_url.startswith("http"):
            file_url = urljoin(base_url, file_url)

        # Determine the file name
        if not file_name:
            file_name = os.path.basename(file_url)
        else:
            ext = os.path.splitext(file_url)[1]
            if not file_name.endswith(ext):
                file_name += ext

        # Skip files that have already been downloaded
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print(f"Attachment already exists, skipping: {file_name}")
            continue

        try:
            # Download the file
            print(f"Downloading attachment: {file_name}")
            response = requests.get(file_url, stream=True, timeout=30)
            response.raise_for_status()

            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            downloaded_files.append(file_name)
            print(f"Attachment downloaded: {file_name}")
        except Exception as e:
            print(f"Failed to download attachment: {file_name}, error: {str(e)}")

    # Return the files downloaded this run and the total file count in the directory
    total_files = len(os.listdir(save_dir)) if os.path.exists(save_dir) else 0
    return downloaded_files, total_files

# Fetch a detail page
def get_page(title, link):
    print(f"Fetching: {title}, {link}")
    web_object = web.create(link, 'edge', load_timeout=20)

    # Base URL used to resolve relative attachment paths
    base_url = "/".join(link.split("/")[:3]) + "/"

    try:
        # Title
        title = web_object.find_by_xpath('/html/body/div[3]/h2').get_text()
        print(title)
        # Line containing the article source and publish date
        meta_text = web_object.find_by_xpath('/html/body/div[3]/div[2]').get_text()
        # Extract the article source
        source_match = re.search(r'文章来源:(.+?) 发布时间:', meta_text)
        source = source_match.group(1) if source_match else None
        print(source)

        # Extract the publish date
        date_match = re.search(r'发布时间:(\d{4}-\d{2}-\d{2})', meta_text)
        post_date = date_match.group(1) if date_match else None
        print(post_date)

        # Article content
        content = web_object.find_by_xpath('/html/body/div[3]/div[3]').get_text()
        print(content)

        # Download attachments
        downloaded_files, attachment_count = download_attachments(web_object, title, base_url)
        if downloaded_files:
            print(f"Downloaded {len(downloaded_files)} attachment(s); {attachment_count} attachment(s) in total")
        else:
            print(f"No new attachments to download; {attachment_count} attachment(s) in total")

        # Save to the database
        save_to_mysql(title, source, post_date, content, attachment_count, link)

    except Exception as e:
        print(f"Failed to fetch page content: {str(e)}")
    finally:
        web_object.close()


# Crawl the list pages
def get_list():
    base_url = 'https://www.cmde.org.cn/flfg/index'
    max_page = 13  # crawl 13 pages (including the first page)

    for page in range(1, max_page + 1):
        # Build the page URL (page 1 is index.html; from page 2 on it is index_1.html, index_2.html, ...)
        url = f"{base_url}.html" if page == 1 else f"{base_url}_{page-1}.html"
        print(f"Crawling page {page}/{max_page}")

        # Open the page and extract the list items
        web_object = web.create(url, 'edge', load_timeout=20)
        elements = web_object.find_all_by_xpath("/html/body/div[3]/div[3]/div[2]/ul/li/a")

        # Process every item on the current page
        for element in elements:
            title = element.get_text().strip()
            link = element.get_attribute("href").replace("../", "https://www.cmde.org.cn/")
            # Crawl the detail page
            get_page(title, link)
            # print(title, link)

        web_object.close()
        sleep(2)  # wait 2 seconds between pages to avoid hammering the site

def main(args):
    get_list()

Verification

Yaozhi data crawling

Home page: Policy & Regulation Database (Yaozhi Data)
Goal: for every article under the link above, crawl the title, issuing department, publish date, document number, legal effect level, timeliness, article content, and link, and import them into the database; if an article has attachments, download them to the corresponding path (\attachments\yaozhi\<article title>\) and record the attachment path and attachment count in the database.

Table creation

CREATE TABLE `hwz_yaozhi` (
  `id` INT NOT NULL AUTO_INCREMENT COMMENT 'Auto-increment primary key',
  `title` VARCHAR(255) NOT NULL COMMENT 'Title',
  `dept` VARCHAR(50) COMMENT 'Issuing department',
  `post_date` DATE COMMENT 'Publish date',
  `zihao` VARCHAR(50) COMMENT 'Document number',
  `level` VARCHAR(50) COMMENT 'Legal effect level',
  `timeliness` VARCHAR(50) COMMENT 'Timeliness',
  `content` TEXT COMMENT 'Article content',
  `attachment_count` INT DEFAULT 0 COMMENT 'Number of attachments',
  `attachment_path` VARCHAR(255) COMMENT 'Attachment storage path (D:\WorkSpace\RPA_project\attachments\yaozhi\<title>\)',
  `link` VARCHAR(255) COMMENT 'Article link',
  `create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'Record creation time',
  `update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Record update time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Yaozhi data';

Python code workflow

  1. List-page handling

    • get_list() walks through the requested number of policy list pages
    • Each page URL is built as https://db.yaozh.com/policies?p={page}.html
    • Table rows (tr) are located with XPath
    • Four key pieces of information are pulled from each row:
      • Title
      • Link (joined into a full URL)
      • Issuing department
      • Publish date
  2. Detail-page handling

    • get_page() is called for every list item
    • Includes a retry mechanism (up to 4 attempts to load the page)
    • Extracts the key fields:
      • Document number
      • Legal effect level
      • Timeliness
      • Article content (several fallback strategies)
  3. Attachment handling

    • download_attachments() downloads all attachments
    • Builds a safe directory name (illegal characters filtered, length capped; see the helper sketch after this list)
    • Downloads PDF/DOC/XLS/ZIP/RAR files
    • Resolves relative paths and file names
  4. Data persistence

    • save_to_mysql() writes all fields to the database
    • Includes update/insert logic (the link decides whether a row is the same record)
    • Over-long fields are truncated automatically (title capped at 255 characters)
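
The directory-name cleanup in step 3 (and the similar inline logic in the other two scripts) could be collected into one small helper. A sketch of roughly what the scripts do, under the name safe_dirname (our name, not one used in the code; the "untitled" fallback is also an addition of this sketch):

def safe_dirname(title, max_len=100):
    # Keep only letters, digits, spaces, hyphens and underscores
    cleaned = "".join(c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_'))
    # Cap the length so the full Windows path stays under the 260-character limit
    cleaned = cleaned[:max_len]
    # Strip leading/trailing spaces and dots, and fall back to a placeholder for empty names
    return cleaned.strip().strip('.') or "untitled"
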
import os
import re
import xbot
import requests
import pymysql
from bs4 import BeautifulSoup
from xbot import print, sleep, web
from . import package
from .package import variables as glv
from urllib.parse import urljoin
from datetime import datetime

# MySQL database configuration
DB_CONFIG = {
    'host': '192.168.200.129',
    'user': 'root',
    'password': 'root123456',
    'database': 'test',
    'charset': 'utf8mb4'
}

# Save a record to MySQL
def save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link):
    connection = None
    try:
        # Open the database connection
        connection = pymysql.connect(**DB_CONFIG)

        # Normalize the date format
        try:
            post_date = datetime.strptime(post_date, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            post_date = None

        # Truncate the title to the first 255 characters (column limit)
        title = title[:255] if len(title) > 255 else title

        with connection.cursor() as cursor:
            # Check whether the record already exists (match on link)
            check_sql = """
                SELECT 1 FROM hwz_yaozhi
                WHERE link = %s
            """
            cursor.execute(check_sql, (link,))
            exists = cursor.fetchone()

            if exists:
                # Update the existing record
                sql = """
                    UPDATE hwz_yaozhi SET
                        title = %s,
                        dept = %s,
                        post_date = %s,
                        zihao = %s,
                        level = %s,
                        timeliness = %s,
                        content = %s,
                        attachment_count = %s,
                        attachment_path = %s,
                        update_time = CURRENT_TIMESTAMP
                    WHERE link = %s
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness, content,
                                     attachment_count, attachment_path, link))
                print(f"Updated record: {title} - {link}")
            else:
                # Insert a new record (including the link field)
                sql = """
                    INSERT INTO hwz_yaozhi (
                        title, dept, post_date, zihao, level, timeliness,
                        content, attachment_count, attachment_path,
                        link
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness,
                                     content, attachment_count, attachment_path,
                                     link))
                print(f"Inserted record: {title} - {link}")

        connection.commit()

    except pymysql.Error as e:
        print(f"Database operation failed: {str(e)}")
        if connection:
            connection.rollback()
    finally:
        if connection:
            connection.close()

# Download attachments
def download_attachments(web_object, title, base_url):
    # 1. Build a filesystem-safe directory name from the title
    safe_title = "".join([c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')]).rstrip()

    # 2. Limit the directory-name length (Windows paths max out at 260 characters; keep 100 here)
    safe_title = safe_title[:100] if len(safe_title) > 100 else safe_title

    # 3. Strip leading/trailing spaces and dots (avoid creating "." or ".." directories)
    safe_title = safe_title.strip().strip('.')

    # Attachment save path
    base_dir = r"D:\WorkSpace\RPA_project\attachments\yaozhi"
    save_dir = os.path.join(base_dir, safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Find all attachment links on the page
    attachment_links = web_object.find_all_by_xpath("//a[contains(@href, '.pdf') or contains(@href, '.doc') or contains(@href, '.xls') or contains(@href, '.zip') or contains(@href, '.rar')]")

    downloaded_files = []

    for link_element in attachment_links:
        file_url = link_element.get_attribute("href")
        file_name = link_element.get_text().strip()

        # Resolve relative URLs
        if not file_url.startswith("http"):
            file_url = urljoin(base_url, file_url)

        # Determine the file name
        if not file_name:
            file_name = os.path.basename(file_url)
        else:
            ext = os.path.splitext(file_url)[1]
            if not file_name.endswith(ext):
                file_name += ext

        # Skip files that have already been downloaded
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print(f"Attachment already exists, skipping: {file_name}")
            continue

        try:
            # Download the file
            print(f"Downloading attachment: {file_name}")
            response = requests.get(file_url, stream=True, timeout=30)
            response.raise_for_status()

            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            downloaded_files.append(file_name)
            print(f"Attachment downloaded: {file_name}")
        except Exception as e:
            print(f"Failed to download attachment: {file_name}, error: {str(e)}")

    # Return the files downloaded this run, the total file count in the directory, and the save path
    total_files = len(os.listdir(save_dir)) if os.path.exists(save_dir) else 0
    return downloaded_files, total_files, save_dir

# Fetch a detail page
def get_page(title, link, dept, post_date):
    print(f"Fetching: {title}, {link}")
    base_url = "/".join(link.split("/")[:3]) + "/"

    # Retry configuration
    MAX_RETRIES = 3  # maximum number of retries
    web_object = None
    for attempt in range(MAX_RETRIES + 1):  # 0, 1, 2, 3 -> 4 attempts in total (first try + 3 retries)
        try:
            web_object = web.create(link, 'edge', load_timeout=30)
            # Loaded successfully, stop retrying
            break
        except Exception as e:
            if attempt < MAX_RETRIES:
                # Wait a moment, then retry
                print(f"Page load failed, retry {attempt+1}: {str(e)}")
                sleep(2)
            else:
                # Final attempt failed
                print(f"Page load timed out after {MAX_RETRIES+1} attempts: {str(e)}")
                web_object = None
                break

    # Give up on this article if the page never loaded
    if web_object is None:
        return

    # Initialize the fields
    zihao = ""
    level = ""
    timeliness = ""
    content = ""

    try:
        # Get all info-item containers
        info_items = web_object.find_all_by_xpath('//div[@class="manual"]/div[contains(@class, "content")]')

        for item in info_items:
            try:
                # Get the span element
                span = item.find_by_xpath('./span')
                if span:
                    span_text = span.get_text().strip()

                    # Extract the actual value
                    if span_text.startswith("【"):
                        # Remove the span text to get the remaining value
                        item_text = item.get_text().strip()
                        value = item_text.replace(span_text, "", 1).strip()

                        # Dispatch by label
                        if "发文字号" in span_text:
                            zihao = value
                        elif "效力级别" in span_text:
                            level = value
                        elif "时效" in span_text.replace(" ", ""):
                            timeliness = value
            except:
                # Skip items without a span
                continue
    except Exception as e:
        print(f"Error while extracting info items: {e}")

    # Extract the article content
    try:
        # First attempt: class="text"
        content_div = web_object.find_by_xpath('//div[@class="text"]')
        if content_div:
            content = content_div.get_text().strip()
    except:
        try:
            # Second attempt: class="new_detail_content"
            content_div = web_object.find_by_xpath('//div[@class="new_detail_content"]')
            if content_div:
                content = content_div.get_text().strip()
        except:
            try:
                # Third attempt: class="text" inside a content div
                content_div = web_object.find_by_xpath('//div[contains(@class, "content")]//div[@class="text"]')
                if content_div:
                    content = content_div.get_text().strip()
            except:
                try:
                    # Fourth attempt: class="new_detail_content" inside a content div
                    content_div = web_object.find_by_xpath('//div[contains(@class, "content")]//div[@class="new_detail_content"]')
                    if content_div:
                        content = content_div.get_text().strip()
                except:
                    try:
                        # Last resort: take the whole "manual" container
                        manual_div = web_object.find_by_xpath('//div[@class="manual"]')
                        if manual_div:
                            content = manual_div.get_text().strip()
                    except Exception as e:
                        print(f"Error while extracting content: {e}")
                        content = "Unable to extract content"

    # Print the results
    print(f"Document number: {zihao}")
    print(f"Legal effect level: {level}")
    print(f"Timeliness: {timeliness}")
    print(f"Article content: {content}")

    # Download attachments
    downloaded_files, attachment_count, attachment_path = download_attachments(web_object, title, base_url)
    print(f"Attachment directory created: {attachment_path}")
    if downloaded_files:
        print(f"Downloaded {len(downloaded_files)} attachment(s); {attachment_count} attachment(s) in total")
    else:
        print(f"No new attachments to download; {attachment_count} attachment(s) in total")

    # Save to the database
    save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link)

    web_object.close()


# Crawl the list pages
def get_list():
    base_url = 'https://db.yaozh.com/policies'
    max_page = 3  # crawl 3 pages (including the first page)
    # To crawl everything, change this to 1289; 3 pages took about an hour, so the full crawl would take roughly 18 days

    for page in range(1, max_page + 1):
        # Build the page URL (pages are addressed as ?p={page}.html)
        url = f"{base_url}?p={page}.html"
        print(f"Crawling page {page}/{max_page}")

        # Open the page and extract the table rows
        web_object = web.create(url, 'edge', load_timeout=30)
        elements = web_object.find_all_by_xpath("/html/body/div[7]/div[6]/div/div[2]/table/tbody/tr")

        # Process every row on the current page
        for element in elements:
            # Title
            title = element.child_at(0).get_text().strip()
            # Link
            link = "https://db.yaozh.com" + element.find_by_xpath("./th/a").get_attribute("href")
            # Issuing department
            dept = element.child_at(1).get_text().strip()
            # Publish date
            post_date = element.child_at(2).get_text().strip()
            # Crawl the detail page
            get_page(title, link, dept, post_date)

        web_object.close()
        sleep(2)  # wait 2 seconds between pages to avoid hammering the site

def main(args):
    get_list()

Verification