大家好，欢迎来到IT知识分享网。
完整代码 — 爬取国家粮食局历年水稻数据
"""Scrape the variety listing tables published under https://ricedata.cn/variety/.

The script runs in three stages:

1. Read the landing page and collect the per-region listing links.
2. For every region, read its first listing page, take the last pager link to
   learn how many pages exist, and expand that into the full list of page URLs.
3. Download every listing page and collect the text of each table row.

Results are left in the module-level lists ``urls_province``,
``privince_pages`` and ``data_content``.
"""
import time

import requests
from lxml import etree

BASE_URL = "https://ricedata.cn/variety/"
IDENTIFIED_URL = BASE_URL + "identified/"
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection blocks forever
REQUEST_DELAY = 0.2   # pause between requests, in seconds

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}


def fetch_tree(url, encoding):
    """Download *url*, decode the body with *encoding* and return the parsed tree.

    The header dict must be passed by keyword: the second positional argument
    of ``requests.get`` is ``params``, which would put the User-Agent into the
    query string instead of sending it as an HTTP header.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = encoding
    return etree.HTML(response.text, etree.HTMLParser())


def build_page_urls(last_href):
    """Expand the last pager link (e.g. ``nation_12.htm``) into all page URLs.

    The page count is the text between the final ``_`` and the following
    ``.``. Splitting on those separators, rather than on the digits, keeps the
    result correct when the same digits also occur earlier in the file name.

    Raises:
        ValueError: if the page count in *last_href* is not an integer.
    """
    stem, underscore, tail = last_href.rpartition("_")
    count, dot, extension = tail.partition(".")
    return [
        IDENTIFIED_URL + stem + underscore + str(page) + dot + extension
        for page in range(1, int(count) + 1)
    ]


# Stage 1: per-region listing links from the landing page.
index_tree = fetch_tree(BASE_URL, "utf-8")
results = (
    index_tree.xpath('/html/body//tr[4]/td/div/a/@href')
    if index_tree is not None
    else []
)
# The hrefs are relative, so prefix them with the landing-page URL.
urls_province = [BASE_URL + result for result in results]
# print(len(urls_province))

# Stage 2: every paginated listing URL of every region,
# e.g. https://ricedata.cn/variety/identified/nation_1.htm
privince_pages = []
for url_province in urls_province:
    html_page = fetch_tree(url_province, "utf-8")
    results_page = (
        html_page.xpath('/html/body/table[2]/caption/b/a/@href')
        if html_page is not None
        else []
    )
    if not results_page:
        # No pager links: nothing to expand, and indexing [-1] would raise.
        print("no pagination links found, skipping:", url_province)
        time.sleep(REQUEST_DELAY)
        continue
    privince_pages.extend(build_page_urls(results_page[-1]))
    time.sleep(REQUEST_DELAY)
print(len(privince_pages))

# Stage 3: row data of every listing page.
data_content = []
for privince_page in privince_pages:
    print(privince_page)
    # NOTE(review): these pages are decoded as gbk while the pages above are
    # decoded as utf-8 -- kept as in the original, confirm against the site.
    contents = fetch_tree(privince_page, "gbk")
    if contents is None:
        # Skip pages that produced no parse tree.
        time.sleep(REQUEST_DELAY)
        continue
    # One list of cell texts per table row.
    content = [
        tr.xpath('./td/text()')
        for tr in contents.xpath('/html/body/table[2]//tr')
    ]
    data_content.extend(content)
    time.sleep(REQUEST_DELAY)
    # print(content)
print(len(data_content))
免责声明：本站所有文章内容、图片、视频等均来源于用户投稿和互联网，或由文摘转载整编而成，不代表本站观点，本站不承担相关法律责任。其著作权归各原作者或其出版社所有。如发现本站有涉嫌抄袭侵权或违法违规的内容，侵犯到您的权益，请在线联系站长，一经查实，本站将立刻删除。 本文来自网络，若有侵权，请联系删除；如若转载，请注明出处：https://yundeesoft.com/29946.html