Common attributes of the Response object:

response.status_code - the HTTP status code
response.content - the response body as bytes
response.text - the response body as a str
response.headers - the response headers
response.request.headers - the request headers
response.request.headers.get('cookie') - the cookie carried by the request
response.cookies - the cookies set by the response (from the set-cookie response header)
response.url - the URL that was requested

What is the difference between text and content? content holds the raw bytes, while text is a str decoded with an encoding guessed from the response headers, which is not entirely reliable.
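A minimal sketch of the difference, assuming the Baidu homepage as the target (any page works): content is always bytes, and decoding the bytes yourself is the reliable route when the guessed encoding is wrong.

```python
import requests

response = requests.get("https://www.baidu.com")
print(type(response.content))  # <class 'bytes'>: the raw body
print(type(response.text))     # <class 'str'>: decoded with a guessed encoding
print(response.encoding)       # the encoding requests guessed from the headers
# decoding the bytes explicitly is more reliable when the guess is wrong
print(response.content.decode("utf-8")[:100])
```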
url = "https://www.baidu.com/img/bd_logo1.png"
# 响应本身是一个图片,并且是二进制类型
response = requests.get(url).content
# 以二进制+写入的方式写入
with open("5.png","wb") as f:
f.write(response)
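A slightly safer variant, sketched here as a suggestion rather than part of the original recipe: raise_for_status() makes the script fail loudly if the server returned an error page instead of the image.

```python
import requests

response = requests.get("https://www.baidu.com/img/bd_logo1.png")
response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
with open("5.png", "wb") as f:
    f.write(response.content)
```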
When downloading a larger resource such as a video, the download can take a long time, and the program cannot do anything else while it runs (unless you use multitasking). If you want to track the download progress without multitasking, you can fetch the resource piece by piece in an iterator-like fashion.
import requests

r = requests.get('https://www.baidu.com', stream=True)
with open('test.html', 'wb') as f:
    for chunk in r.iter_content(chunk_size=100):
        f.write(chunk)
import requests

def download_video(url, save_path):
    response = requests.get(url, stream=True)
    # total size in bytes, from the content-length response header (0 if absent)
    total_size = int(response.headers.get('content-length', 0))
    downloaded_size = 0
    with open(save_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
                downloaded_size += len(chunk)
                if total_size:  # avoid division by zero when content-length is missing
                    percent = (downloaded_size / total_size) * 100
                    print(f"Download progress: {percent:.2f}%")
    print("Download complete...")

# call the download function
video_url = "http://v3-web.douyinvod.com/dbb2c985b18e9a2b6089ada19767525f/659692f3/video/tos/cn/tos-cn-ve-15c001-alinc2/oQblva95AUnIpA9DeieFkQmqAIu0IgBVD2iNTA/?a=6383&ch=5&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=1161&bt=1161&cs=0&ds=4&ft=LjhJEL998xIouEkmD0P5H4eaciDXtVzxF_QEePMkzijD1Inz&mime_type=video_mp4&qs=0&rc=NTYzOGVlZmZpaDo6M2loM0BpM3VseGQ6ZnM2bjMzNGkzM0BgYDQuXi5iNS8xXmBgMDNfYSMvbmZocjQwL2RgLS1kLWFzcw%3D%3D&btag=e00038000&dy_q=1704362620&feature_id=46a7bb47b4fd1280f3d3825bf2b29388&l=202401041803391CCB1AB5D5A9E107FA68"
path = "video.mp4"
download_video(video_url, path)
Defining a User-Agent:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
import requests
url = 'https://www.baidu.com'
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
}
# include the User-Agent in the request headers to mimic a browser
response = requests.get(url, headers=headers)
# print the request headers that were actually sent
print(response.request.headers)
# the response body
print(response.text)
When using Baidu search you will often notice a ? in the URL. Everything after the question mark is the request parameters, also called the query string. To automate a search, the URL we send should carry these parameters.

Example URL: https://www.baidu.com/s?wd=python
import requests

# 1. build the query string parameters to carry
kw = {'wd': 'java'}
# 2. send the request
response = requests.get('https://www.baidu.com/s', params=kw)
# 3. inspect the URL that was actually sent
print(response.url)
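One convenience worth noting, shown in a small sketch (the Chinese keyword here is just an illustration): values passed through params are URL-encoded automatically, so you never need to percent-encode them yourself.

```python
import requests

# non-ASCII parameter values are percent-encoded for you
response = requests.get('https://www.baidu.com/s', params={'wd': '爬虫'})
print(response.url)  # e.g. https://www.baidu.com/s?wd=%E7%88%AC%E8%99%AB
```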
The query string parameters can also be written directly into the URL:
import requests
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
url = 'https://www.baidu.com/s?wd=python'
# the URL already contains the query parameters, so params is not needed here
response = requests.get(url, headers=headers)
print("Requested URL:", response.url)
print("Response body:", response.content)
import requests
url = "http://www.cninfo.com.cn/new/disclosure"
# package the data the API expects into a dict and send it in the request body
form_data = {
'column': 'szse_latest',
'pageNum': 1,
'pageSize': 30,
'sortName': '',
'sortType': '',
'clusterFlag': 'true'
}
response = requests.post(url, data=form_data)
print(response.json())  # the JSON response, parsed into a dict automatically
General steps: capture the target request with the browser's developer tools (Network panel), then replicate its URL, headers, and form data in code.

Hands-on demo: scraping Baidu Translate
import requests
url = "https://fanyi.baidu.com/basetrans"
headers = {
"Referer": "https://fanyi.baidu.com/",
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/123.0.0.06",
"Cookie": 'BAIDUID=3DD414F7B59CD8E5BE699680E270E043:FG=1; BAIDUID_BFESS=3DD414F7B59CD8E5BE699680E270E043:FG=1; __bid_n=18deeb9406e1152b951114; BAIDU_WISE_UID=wapp_1709723776397_808; ZFY=1hVcX:BpfbxY633Z6ISiEDM2:ABD9JOlonfK2lbUdGMWg:C; BIDUPSID=3DD414F7B59CD8E5BE699680E270E043; PSTM=1709723776; BDUSS=1vTGMtLTR2N0xTcE5neWZKd1FzUTAzVXFPYkR6eUp1UG5TQ3V1Yjc1YXNwaVJtSVFBQUFBJCQAAAAAAAAAAAEAAABxj315eWVhcrDdyqYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKwZ~WWsGf1lcS; BDUSS_BFESS=1vTGMtLTR2N0xTcE5neWZKd1FzUTAzVXFPYkR6eUp1UG5TQ3V1Yjc1YXNwaVJtSVFBQUFBJCQAAAAAAAAAAAEAAABxj315eWVhcrDdyqYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKwZ~WWsGf1lcS; OPEN_PLATFORM_IFRAME_XH_TOKEN=2407350178_121.b0cbaf47b1bed61cb5d73aba988f3645.Y_I3KYHM__OmGBlDDIb58KgkJSurDz_jwk3Nj4O.HhKGQQ; H_PS_PSSID=40079_40298_40416_40465_40459_39661_40499_40511_40398_40446_60028_60033_60047; RT="z=1&dm=baidu.com&si=f833f3f7-fae4-4bec-8495-cb61ecdc29b0&ss=luscubd0&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ul=1kq&hd=1kx"; H_WISE_SIDS=110085_287977_288665_277936_297005_287174_298395_298193_299806_300006_299170_287168_300300_285939_299594_300088_256739_300891_298527_268434_500206_295637_301259_301558_300063_301022_301444_298483_294881_301696_302186_301810_600056_600103_301561_301292_302359_600147_302486_300254_293219_600225_297165_600339_600329_600319_600381_301240_299250_291026_301603_281879_600597_255952_299217_282466_600625_600645_600670_600702_600787_600797_600747_600851_600888_600101_600931_600800_600804_299026_292242_230969_601076_601123_600500_300820_295819_601305_601311_302069_601259_601295_285671_294446_601424_300589_601498_601481_298335_303475_601542_601596_601599; H_WISE_SIDS_BFESS=110085_287977_288665_277936_297005_287174_298395_298193_299806_300006_299170_287168_300300_285939_299594_300088_256739_300891_298527_268434_500206_295637_301259_301558_300063_301022_301444_298483_294881_301696_302186_301810_600056_600103_301561_301292_302359_600147_302486_300254_293219_600225_297165_600339_600329_600319_600381_301240_299250_291026_301603_281879_600597_255952_299217_282466_600625_600645_600670_600702_600787_600797_600747_600851_600888_600101_600931_600800_600804_299026_292242_230969_601076_601123_600500_300820_295819_601305_601311_302069_601259_601295_285671_294446_601424_300589_601498_601481_298335_303475_601542_601596_601599; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1712665543,1713079899; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1712665543,1713079899; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1713079976; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1713079976; ab_sr=1.0.1_YThlZDliZDJlOWEzOTNlYzYyNzBkM2ExODVlN2JiMjYwZjQwODE1MjI2MTc2YzdhY2FkMThlZDAwY2RhMzNiY2RlZTk5MTBlNDNlNTM2MDRkOWJkYzJjZTMzNzM0OWQwMmRmZWQ2NGMxOTBiNzhhNmRlN2RlMTI1MDA3ZTI5ZDg2NTUyZmU2YzM0NGYwMDk4Yjc2YmY2YjQxNjc0MjQ1NzEzZmIyNmUzNWM3MzQ1YmVhNjc0MmYxZmY2NTZmNTdj'
}
form_data = {
"query": "tomcat",
"from": "en",
"to": "zh",
"token": "a0c46a29ebaeca58d5cd4bbfcf5f7375",
"sign": "581376.801841"
}
response = requests.post(url, headers=headers, data=form_data).json()
print(response['dict']['symbols'][0]['parts'][0]['means'])
Redirects and the allow_redirects=False parameter

Test code:
import requests
headers = {
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
}
r = requests.get("http://www.baidu.com", headers=headers)
print(r.url)
# the code above prints: https://m.baidu.com/?from=844b&vit=fps
The default behavior of requests

By default, requests follows redirects automatically for every request method except HEAD, including GET and POST. That is why we requested http://www.baidu.com above yet the printed URL is https://m.baidu.com/?from=844b&vit=fps.
Disabling automatic redirects

Automatic redirects can be turned off by passing the following setting when sending a request:

requests.get(url, allow_redirects=False)

Example code:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/123.0.0.0"
}
url = 'http://www.baidu.com'
# allow_redirects=False disables the automatic redirect
response = requests.get(url, headers=headers, allow_redirects=False)
print(response.url)
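With redirects disabled, the 30x response itself comes back, so the redirect target can be read from the Location header. A short sketch (the exact status code depends on the server):

```python
import requests

headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/123.0.0.0"}
response = requests.get('http://www.baidu.com', headers=headers, allow_redirects=False)
print(response.status_code)              # e.g. 302
print(response.headers.get('Location'))  # where the server wanted to send us
```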
Inspecting the redirect history (default behavior)
import requests
headers = {
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
}
response = requests.get("http://www.360buy.com", headers=headers)
print("历史请求过程信息:")
print(response.history)
for one_info in response.history:
print(one_info.status_code, one_info.url, one_info.headers)
print("\n\n最后一次的请求信息:")
print(response.status_code, response.url, response.headers)
When visiting an HTTPS site whose certificate has expired, requests raises an error. Certificate verification can be switched off with the verify parameter:
import requests
url = "https://chinasoftinc.com/owa"
response = requests.get(url, verify=False)
print(response)
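With verify=False, urllib3 (bundled with requests) prints an InsecureRequestWarning on every request; a small sketch of silencing it:

```python
import urllib3

# suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
```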
The environment a page lives in is never static: sometimes a request gets no response and would leave us waiting forever. The timeout parameter sets the maximum time to wait for a response.
url = "https://www.google.com/search"
try:
response = requests.get(url, timeout=3)
print(response)
except Exception as e:
print('请求超时:',e)
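timeout also accepts a (connect, read) tuple, which separates the time allowed for establishing the connection from the time allowed for the response; a sketch, with requests.exceptions.Timeout catching only timeout errors:

```python
import requests

try:
    # up to 3 s to connect, up to 10 s to receive the response
    response = requests.get("https://www.google.com/search", timeout=(3, 10))
    print(response)
except requests.exceptions.Timeout as e:
    print("Timed out:", e)
```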
When a page does not respond, the retrying module can be used to send the request several times.

Install it with:

pip install retrying -i https://pypi.tuna.tsinghua.edu.cn/simple
What it does: the retrying module provides a retry decorator. Passing stop_max_attempt_number makes a failing function run again, up to the maximum number of attempts; if every attempt raises, the whole call raises, but if any attempt succeeds, execution simply continues.

import requests
from retrying import retry
@retry(stop_max_attempt_number=3)
def request_google():
    global num
    print(f"Attempt {num}")
    num += 1
    url = "https://www.google.com"
    response = requests.get(url=url, timeout=1)
    print(response)

if __name__ == '__main__':
    num = 1
    request_google()
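retry takes a few other useful parameters; a sketch assuming we want a 2-second pause between attempts and want to retry only on requests' own exceptions (wait_fixed and retry_on_exception are both part of the retrying API):

```python
import requests
from retrying import retry

# retry only when the raised exception is a requests error
def is_request_error(exception):
    return isinstance(exception, requests.exceptions.RequestException)

@retry(stop_max_attempt_number=3, wait_fixed=2000, retry_on_exception=is_request_error)
def fetch(url):
    return requests.get(url, timeout=1)
```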
When we send a POST request we usually carry data. Earlier, when learning POST, we passed form data by assigning to the data parameter:

requests.post(url, data={"kw": "python"})

But quite often the server expects JSON instead. What then?

requests.post(url, json={"kw": "python"})

Simply use the json parameter of the request method.

Example code:
import requests
headers = {
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
}
response = requests.post("https://fanyi.baidu.com/sug", headers=headers, json={"kw": "python"}, timeout=3)
print("请求头是:", response.request.headers)
print("请求体是:", response.request.body)
When scraping certain pages, the server often requires a cookie, and to obtain one you first have to log in at some URL. The server receives the request, verifies the username and password, and on success returns a response whose headers usually contain a set-cookie entry; its value is the cookie to be set.

We could already extract the newly set cookie from that response with requests.utils.dict_from_cookiejar(r.cookies), but carrying it into the next request by hand is cumbersome, so requests offers a higher-level mechanism: the session, requests.Session.

What Session does

A Session can persist certain parameters across requests, and it keeps cookies across all requests made from the same Session instance. Session keeping has two aspects: cookies are saved, so the next request automatically carries the cookies from the previous one; and the underlying connection is reused, which speeds up consecutive requests.
Usage:
# 1. create a Session instance
s = requests.Session()
# 2. send requests through the object created in step 1
r = s.get(url1, headers=headers)
r = s.get(url2, headers=headers)
r = s.get(url3, headers=headers)
r = s.get(url4, headers=headers)
After a session object has requested a site, any cookie the remote server sets locally is stored on the session object; the next time that same session object requests the server, the previous cookies are carried along automatically.
Example code:
import requests
session = requests.Session()
headers = {
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
}
# send the first request
response = session.get('https://www.baidu.com', headers=headers)
print("Request headers of the first request:", response.request.headers)
print("Response headers:", response.headers)
print("Cookies that were set:", requests.utils.dict_from_cookiejar(response.cookies))
# send the second request - it carries the previous cookie automatically
response = session.get("https://www.baidu.com")
print("Request headers of the second request:", response.request.headers)
When scraping a site, the other side may block us, for example by blacklisting our machine's public IP; the site will then ignore every request sent from that IP. In that case we can use a proxy to get around the block and keep collecting data.
Usage:
import requests
# an HTTP proxy
ip = "127.0.0.1"
port = 7890
# pack the proxy address and port into a dict and pass it via the proxies parameter
proxies = {
"http": "http://%s:%d" % (ip, port),
"https": "http://%s:%d" % (ip, port)
}
print(proxies)
# request headers
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
}
url = "http://httpbin.org/ip"
response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
print(response.text)
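If the proxy requires authentication, requests accepts credentials embedded in the proxy URL; a sketch with placeholder user/password values:

```python
# user and pass are placeholders for real proxy credentials
proxies = {
    "http": "http://user:pass@127.0.0.1:7890",
    "https": "http://user:pass@127.0.0.1:7890",
}
```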