Python爬虫(Xpath)爬取文件 – 作者:1627052775

爬取简历模板

# Scrape the free résumé-template listing on sc.chinaz.com and save every
# template archive (.rar) into SAVE_DIR.
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

url = 'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&classID=864'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.101 '
                  'Safari/537.36 Edg/91.0.864.48 '
}
# Directory the downloaded archives are written to.
SAVE_DIR = './简历模板'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Characters that are not allowed (or are unsafe) in file names.
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def fetch_tree(page_url):
    """Download ``page_url`` and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # The original script repaired titles with a latin-1 -> utf-8 round trip,
    # which implies the pages are UTF-8 but get decoded as ISO-8859-1 by
    # default. Decode as UTF-8 once here instead of patching strings later.
    # NOTE(review): confirm the site still serves UTF-8.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def safe_file_name(title):
    """Turn a page title into a ``.rar`` file name without unsafe characters."""
    return _UNSAFE_NAME_CHARS.sub('_', title).strip() + '.rar'


def download_template(detail_url):
    """Save the archive linked from one template detail page into SAVE_DIR.

    The first title and the first download link found on the page are used.
    A page with no title or no download link is reported and skipped.

    Raises:
        requests.RequestException: if the page or the archive cannot be
            downloaded.
    """
    tree = fetch_tree(detail_url)
    # Title of the template, used as the file name.
    titles = tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
    # Address of the .rar archive (second entry of the download list).
    links = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[2]/a/@href')
    if not titles or not links:
        print('skipped, no title or download link:', detail_url)
        return

    file_name = safe_file_name(str(titles[0]))
    print(file_name)

    archive = requests.get(
        url=urljoin(detail_url, str(links[0])),
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # Do not store an HTTP error page under a .rar name.
    archive.raise_for_status()
    with open(os.path.join(SAVE_DIR, file_name), 'wb') as fp:
        fp.write(archive.content)


def main():
    """Collect every detail-page link from the listing and download each one."""
    os.makedirs(SAVE_DIR, exist_ok=True)

    listing = fetch_tree(url)
    # urljoin copes with protocol-relative ("//host/..."), relative and
    # absolute hrefs alike.
    detail_urls = [
        urljoin(url, str(href))
        for href in listing.xpath('//div[@id="main"]/div/div/a/@href')
    ]

    for detail_url in detail_urls:
        try:
            download_template(detail_url)
        except requests.RequestException as exc:
            # One broken page must not abort the remaining downloads.
            print('failed:', detail_url, exc)


if __name__ == '__main__':
    main()

本人刚入门,代码写得有点乱,还请多多包涵;各位大佬如有意见或建议,欢迎指出。

来源:freebuf.com 2021-06-17 13:53:29 by: 1627052775

© 版权声明
THE END
喜欢就支持一下吧
点赞0
分享
评论 抢沙发

请登录后发表评论