爬取简历模板
# -*- coding: utf-8 -*-
"""Crawl free resume templates from sc.chinaz.com and save each .rar archive locally."""
import os

import requests
from lxml import etree

# Listing page of free resume templates (keyword=免费 "free", classID=864).
LIST_URL = 'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&classID=864'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.101 '
                  'Safari/537.36 Edg/91.0.864.48 '
}
SAVE_DIR = './简历模板'


def fetch_detail_urls(session):
    """Return absolute URLs of every template detail page found on the listing page."""
    response = session.get(url=LIST_URL, headers=HEADERS)
    # The site serves UTF-8 but omits a charset header, so requests guesses
    # ISO-8859-1; set the encoding explicitly instead of re-decoding later.
    response.encoding = 'utf-8'
    tree = etree.HTML(response.text)
    # The hrefs are protocol-relative ("//sc.chinaz.com/..."), so add the scheme.
    return ['https:' + href for href in tree.xpath('//div[@id="main"]/div/div/a/@href')]


def download_templates(session, detail_url):
    """Download every .rar archive linked from one detail page into SAVE_DIR."""
    response = session.get(url=detail_url, headers=HEADERS)
    response.encoding = 'utf-8'  # replaces the encode('iso-8859-1').decode('utf-8') workaround
    tree = etree.HTML(response.text)
    rar_urls = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[2]/a/@href')
    titles = tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
    # Pair each archive URL with its title.  The original version used two
    # independent nested loops, which mis-pairs names and archives whenever a
    # page lists more than one of either.
    for title, rar_url in zip(titles, rar_urls):
        archive = session.get(url=rar_url, headers=HEADERS).content
        # Save inside SAVE_DIR — the original created the folder but then
        # wrote the files into the current working directory.
        path = os.path.join(SAVE_DIR, title.strip() + '.rar')
        with open(path, 'wb') as fp:
            fp.write(archive)
        print(path)


def main():
    """Create the output folder and download all listed resume templates."""
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # One Session reuses the underlying TCP connection across all requests.
    with requests.Session() as session:
        for detail_url in fetch_detail_urls(session):
            download_templates(session, detail_url)


if __name__ == '__main__':
    main()
正在入门阶段,代码写得有点乱,请多包涵;大佬们如有意见,欢迎提出指正。
来源:freebuf.com 2021-06-17 13:53:29 by: 1627052775
© 版权声明
文章版权归作者所有,未经允许请勿转载。
THE END
喜欢就支持一下吧
请登录后发表评论
注册