如何优雅地爬妹子网,手把手教你

如何优雅地爬妹子网,手把手教你。前言:本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理。作者:bihl。直接上代码,哈哈!!代码开头:from urllib import request、import os、from user_agents import ua_list……

大家好,欢迎来到IT知识分享网。

 

直接上代码,哈哈!!
from urllib import request
import os
from user_agents import ua_list
import time
import random
import re
import requests
from lxml import etree


class MeiziSpider():
    """Crawl the mzitu.com listing page and save every gallery image to disk.

    Flow: ``parse_html`` extracts gallery links from the listing page,
    ``write_html`` visits them one by one with a random pause, and
    ``save_image`` walks the numbered pages of a gallery, saving one image
    per page.
    """

    # Seconds to wait for any single HTTP response; without a timeout one
    # stalled connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, save_dir='/home/ubuntu/meizi/'):
        """Create a spider.

        Args:
            save_dir: Directory under which one sub-directory per gallery
                title is created. Defaults to the path the script has
                always used.
        """
        self.url = 'https://www.mzitu.com/all/'
        self.save_dir = save_dir

    def get_html(self, url):
        """Fetch ``url`` with a random User-Agent and return the raw body bytes."""
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read() raises.
        with request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as res:
            return res.read()

    def re_func(self, re_bds, html):
        """Return all matches of the regex ``re_bds`` in ``html``.

        The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
        """
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    # Extract the wanted data: parse the first-level (listing) page.
    # An earlier variant pulled (href, title) pairs out of the
    # <p class="url"> anchors with a regex; the XPath query below replaced it.
    def parse_html(self, url):
        """Download the listing page at ``url`` and process every gallery link."""
        html = self.get_html(url).decode()
        parse_obj = etree.HTML(html)
        href_list = parse_obj.xpath(
            '//div[@class="all"]/ul[@class="archives"]/li/p[@class="url"]/a/@href'
        )
        print("href_list:", href_list)
        self.write_html(href_list)

    def write_html(self, href_list):
        """Visit each gallery URL in ``href_list``, pausing 1-3 s before each."""
        for two_url in href_list:
            print(two_url)
            time.sleep(random.randint(1, 3))
            self.save_image(two_url)

    def save_image(self, two_url):
        """Save the image shown on each numbered page of the gallery ``two_url``.

        Pages are requested as ``two_url/0``, ``two_url/1``, ... until a page
        cannot be fetched, contains no image markup, or the image cannot be
        stored. Files are written to ``<save_dir>/<title>/<title><page>.jpg``.
        """
        headers = {'Referer': two_url, 'User-Agent': random.choice(ua_list)}
        print('---------two_url-----------', two_url)
        # ``\s*`` before ``/>`` accepts both a plain space and a line break
        # there, so the pattern no longer depends on how the markup is wrapped.
        re_bds = (
            r' <div class="main-image"><p><a href="https://www.mzitu.com/.*?" ><img '
            r'src="(.*?)" alt="(.*?)" width=".*?" height=".*?"\s*/></a></p>'
        )

        i = 0
        while True:
            img_link = two_url + '/{}'.format(i)
            print("img_link:", img_link)
            try:
                page = requests.get(
                    url=img_link, headers=headers, timeout=self.REQUEST_TIMEOUT
                )
                page.raise_for_status()
            except requests.RequestException as e:
                # An unreachable or missing page ends this gallery.
                print("stop, page request failed:", e)
                break

            img_html_list = self.re_func(re_bds, page.text)
            print("img_html_list", img_html_list)
            if not img_html_list:
                # No image markup on this page: past the last page.
                break

            img_src, name = img_html_list[0]
            print("-----name:", name)
            # A '/' in the title would otherwise be read as a path separator
            # and point the file at a directory that does not exist.
            safe_name = name.replace('/', '_')
            direc = os.path.join(self.save_dir, safe_name, '')
            print("direc:", direc)
            filename = direc + safe_name + str(i) + '.jpg'

            try:
                os.makedirs(direc, exist_ok=True)
                # Request the image link; the response body is raw bytes.
                img_res = requests.get(
                    url=img_src, headers=headers, timeout=self.REQUEST_TIMEOUT
                )
                # Never store an HTTP error page under a .jpg name.
                img_res.raise_for_status()
                with open(filename, 'wb') as f:
                    f.write(img_res.content)
            except (requests.RequestException, OSError) as e:
                print("stop, image could not be saved:", e)
                break
            i += 1


if __name__ == '__main__':
    spider = MeiziSpider()
    spider.parse_html('https://www.mzitu.com/all')

大功告成。都看懂了吧。如果有不懂的,可以去小编的Python交流.裙 :一久武其而而流一思(数字的谐音)转换下可以找到了,里面有最新Python教程项目!一起交流进步吧

免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://yundeesoft.com/30080.html

(0)

相关推荐

发表回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注

关注微信