对面的新手看过来:爬虫常用代码双手奉送!

对面的新手看过来:爬虫常用代码双手奉送!请求示例:import requests; url = "www.baidu.com"; resp = requests.get(url); htmls = resp.text

大家好,欢迎来到IT知识分享网。

请求

# Request: fetch a page and read its HTML text.
import requests

# Fixed: scraped smart quotes replaced with ASCII quotes, and requests
# requires an explicit scheme — a bare "www.baidu.com" raises MissingSchema.
url = "http://www.baidu.com"

resp = requests.get(url)

htmls = resp.text

beautifulsoup系列

# BeautifulSoup series: locate tags and extract their text.
from bs4 import BeautifulSoup

soup = BeautifulSoup(htmls, "lxml")

# Find one <a> matching a class, an id, and an arbitrary attribute.
# Fixed: stray extra closing paren removed; smart quotes replaced.
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Text of the first <div>.
soup.find("div").get_text()

# Same, with surrounding whitespace stripped.
str(soup.find("div").get_text()).strip()

# Iterate over at most the first five <div> tags.
# Fixed: the for statement was missing its colon and the body its indent.
for i in soup.find_all("div", limit=5):
    print(i.get_text())

正则系列

# Regex series: extract each news item's title and url from a
# JSONP-style payload ("rollback({...})" wrapper, as returned by the API).
import re

# Sample payload (raw string so the \/ escapes survive byte-for-byte).
htmls = r'''rollback({
 "response": {
 "code": "0",
 "msg": "Success",
 "dext": ""
 },
 "data": {
 "count": 3,
 "page": 1,
 "article_info": [{
 "title": "“小库里”:适应比赛是首要任务 投篮终会找到节奏",
 "url": "http:\/\/sports..com\/a\/20180704\/035378.htm",
 "time": "2018-07-04 16:58:36",
 "column": "NBA",
 "img": "",
 "desc": ""
 }, {
 "title": "首钢体育助力国家冰球集训队 中国冰球联赛年底启动",
 "url": "http:\/\/sports..com\/a\/20180704\/034698.htm",
 "time": "2018-07-04 16:34:44",
 "column": "综合体育",
 "img": "",
 "desc": ""
 }...]
 }
})'''

# Extract every news item's title and url from this JSON.
# (.*?) captures the content; .*? skips the fields in between (re.DOTALL
# lets it span newlines). Fixed: added \s* after the colons — the payload
# is formatted as "title": "..." with a space, so the original pattern
# matched nothing.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = re.findall(pattern, htmls)
for i in items:
    title = i[0]  # fixed typo: was "tilte"
    url = i[1]

过滤html标签,保留标签里的内容

# Strip HTML tags, keeping only the inner text.
import re

htmls = "<p>abc</p>"

# <[^>]+> matches one whole tag; re.S is harmless here (no '.' in the
# pattern) but kept from the original. Fixed: smart/mismatched quotes and
# the garbled empty-string replacement argument.
dr = re.compile(r'<[^>]+>', re.S)

htmls2 = dr.sub('', htmls)

print(htmls2)  # abc

过滤script和style标签,标签里的内容也需过滤掉

# Strip <script> and <style> tags, including their contents.
import requests
from bs4 import BeautifulSoup

# NOTE(review): the domain looks truncated by the scraper ("new..com") —
# confirm the real article URL before running.
url = "http://new..com/omn/20180705/20180705A0920X.html"

r = requests.get(url)

htmls = r.text

soup = BeautifulSoup(htmls, "lxml")

# extract() removes each tag from the tree, so both the elements and the
# inline JS/CSS inside them are dropped (unlike tag stripping via regex).
# Fixed: the loop body was missing its indentation.
for script in soup(["script", "style"]):
    script.extract()

print(soup)

日期、时间的处理

# Date and time handling.
import datetime

import time

# Current date (year-month-day).
today = datetime.date.today()

print(today)  # 2018-07-05

# Current time, formatted.
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))

print(time_now)  # 2018-07-05 14:20:55

# Format a given unix timestamp a.
# Fixed: the source left the value blank; 1502691655 matches the
# 2017-08-14 sample output (in the article's original timezone).
a = 1502691655

time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))

print(time_a)  # 2017-08-14 14:20:55

# Time arithmetic.

# Tomorrow's date.
today = datetime.date.today()

tomorrow = today + datetime.timedelta(days=1)

print(tomorrow)  # 2018-07-06

# Three days ago.
today = datetime.datetime.today()

tomorrow = today + datetime.timedelta(days=-3)

print(tomorrow)  # 2018-07-02 13:37:00

# Difference between two datetimes.
start = "2018-07-03 00:00:00"

time_now = datetime.datetime.now()

b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')

# .seconds is only the sub-day remainder of the delta; it must be combined
# with .days (below) to get the full elapsed time.
minutes = (time_now - b).seconds / 60

days = (time_now - b).days

all_minutes = days * 24 * 60 + minutes

print(minutes)  # 821.67

print(days)  # 2

print(all_minutes)  # 3701.64

base64编码与解码

# base64 encoding/decoding, and percent-decoding Chinese text in a URL.
import base64
import urllib.parse  # fixed: "import urllib" alone does not expose urllib.parse

content = "abc124我是"
# Encode to UTF-8 bytes, base64-encode, then decode back to str for transport.
contents_base64 = base64.b64encode(content.encode('utf-8', 'ignore')).decode("utf-8")
# b64decode returns bytes; .decode('utf-8') would recover the original text.
contents = base64.b64decode(contents_base64)

# Decode percent-escapes in a URL.
url = "www.baidu.com?wb =%e8%85"
# Fixed: the original passed an undefined name "soup3" instead of "url".
result_url = urllib.parse.unquote(url)

数据库操作

# Database operations with pymysql: insert, batch insert, update.
import pymysql

conn = pymysql.connect(host='10.0.8.81', port=3306, user='root',
                       passwd='root', db='xxx', charset='utf8')
cur = conn.cursor()

# Fixed: this string literal was unterminated in the original.
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
id = 1
name = "like"
age = 26
data_list = []
data = (id, name, age)

# Single-row insert (parameterized — the driver escapes the values).
cur.execute(insert_sql, data)
conn.commit()

# Batch insert.
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()

# Handling special characters contained in name.
data = (id, pymysql.escape_string(name), age)

# Update. NOTE(review): building SQL with %-interpolation is
# injection-prone; prefer parameterized queries as in the batch update
# below. "content" is not defined in this snippet — supply it before use.
update_sql = "update tbl_name set content = '%s' where id = " + str(id)
cur.execute(update_sql % (pymysql.escape_string(content)))
conn.commit()

# Batch update, flushed every 500 rows. NOTE(review): update_data_list,
# contents, title, is_spider and one_new come from the surrounding crawler
# loop, not this snippet.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except Exception:
        # Fixed: the original try block was cut off with no except clause;
        # roll back the failed batch so the connection stays usable.
        conn.rollback()
对面的新手看过来:爬虫常用代码双手奉送!

免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://yundeesoft.com/76393.html

(0)

相关推荐

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

关注微信